library(tidyverse)
# library(dict) # Still not found after installation
library(container) # For Dict class
library(useful) # For simple.impute
library(comprehenr) # For list comprehension
library(GGally)
library(reshape2)
library(gridExtra)
library(gplots)
library(DescTools) # For df summary
library(robustHD) # For df summary
library(caret)
library(effsize) # For Cohen's d
source('tools/wrangle.R')
source('tools/eda.R')
source('tools/engineer.R')
source('tools/split.R')
# SEED = 65466
Here’s the wrangled training set loaded from objects serialized at the end of wrangle_and_split.Rmd.
val_train_X = readRDS("data/val_train_X_wrangled.rds")
val_train_y = readRDS("data/val_train_y_wrangled.rds")
# Merge for easier analysis.
val_train_Xy = merge(
x = val_train_X,
y = val_train_y,
by = 'Id',
all = TRUE
)
str(val_train_Xy)
'data.frame': 715 obs. of 81 variables:
$ Id : chr "1" "10" "1000" "1002" ...
$ MSSubClass : Factor w/ 16 levels "20","30","40",..: 6 16 1 2 1 9 14 1 5 11 ...
$ MSZoning : Factor w/ 8 levels "A","C","FV","I",..: 6 6 6 6 6 6 8 6 6 6 ...
$ LotFrontage : num 65 50 64 60 75 65 21 NA 115 75 ...
$ LotArea : num 8450 7420 6762 5400 11957 ...
$ Street : Ord.factor w/ 3 levels "None"<"Grvl"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Alley : Ord.factor w/ 3 levels "None"<"Grvl"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ LotShape : Ord.factor w/ 4 levels "Reg"<"IR1"<"IR2"<..: 1 1 1 1 2 1 1 2 1 1 ...
$ LandContour : Factor w/ 4 levels "Lvl","Bnk","HLS",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Utilities : Ord.factor w/ 5 levels "None"<"ELO"<"NoSeWa"<..: 5 5 5 5 5 5 5 5 5 5 ...
$ LotConfig : Factor w/ 5 levels "Inside","Corner",..: 1 2 1 2 1 1 1 1 1 1 ...
$ LandSlope : Ord.factor w/ 4 levels "None"<"Gtl"<"Mod"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 4 6 18 22 6 11 17 20 8 ...
$ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 5 3 3 3 3 3 ...
$ Condition2 : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 3 3 3 3 3 3 ...
$ BldgType : Factor w/ 5 levels "1Fam","2FmCon",..: 1 2 1 1 1 1 4 1 1 3 ...
$ HouseStyle : Factor w/ 8 levels "1Story","1.5Fin",..: 4 3 1 1 1 8 4 1 2 1 ...
$ OverallQual : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 5 7 5 8 5 4 6 5 5 ...
$ OverallCond : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 6 5 6 5 8 4 7 5 5 ...
$ YearBuilt : int 2003 1939 2006 1920 2006 1977 1970 1977 1948 1965 ...
$ YearRemodAdd : int 2003 1950 2006 1950 2006 1977 1970 2001 1950 1965 ...
$ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 4 ...
$ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Exterior1st : Factor w/ 17 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
$ Exterior2nd : Factor w/ 18 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
$ MasVnrType : Factor w/ 5 levels "BrkCmn","BrkFace",..: 2 4 5 4 2 2 4 2 4 4 ...
$ MasVnrArea : num 196 0 24 0 53 220 0 28 0 0 ...
$ ExterQual : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 3 4 4 3 3 3 3 ...
$ ExterCond : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 1 3 1 3 2 2 3 2 2 ...
$ BsmtQual : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 5 4 5 3 5 5 4 4 4 1 ...
$ BsmtCond : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 5 4 4 4 1 ...
$ BsmtExposure : Ord.factor w/ 5 levels "None"<"No"<"Mn"<..: 2 2 4 2 2 4 2 3 2 1 ...
$ BsmtFinType1 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 7 7 7 2 7 7 5 6 2 1 ...
$ BsmtFinSF1 : num 706 851 686 0 24 595 273 1200 0 0 ...
$ BsmtFinType2 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 2 2 2 2 2 2 3 2 2 1 ...
$ BsmtFinSF2 : num 0 0 0 0 0 0 273 0 0 0 ...
$ BsmtUnfSF : num 150 140 501 691 1550 390 0 410 720 0 ...
$ TotalBsmtSF : num 856 991 1187 691 1574 ...
$ Heating : Factor w/ 7 levels "None","Floor",..: 3 3 3 3 3 3 3 3 3 3 ...
$ HeatingQC : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 6 6 6 6 6 4 4 5 4 4 ...
$ CentralAir : Ord.factor w/ 2 levels "N"<"Y": 2 2 2 2 2 2 2 2 2 1 ...
$ Electrical : Factor w/ 6 levels "None","SBrkr",..: 2 2 2 3 2 2 2 2 2 2 ...
$ X1stFlrSF : num 856 1077 1208 691 1574 ...
$ X2ndFlrSF : num 854 0 0 0 0 0 546 0 551 0 ...
$ LowQualFinSF : num 0 0 0 0 0 0 0 0 0 0 ...
$ GrLivArea : num 1710 1077 1208 691 1574 ...
$ BsmtFullBath : int 1 1 1 0 0 0 0 1 0 0 ...
$ BsmtHalfBath : int 0 0 0 0 0 0 0 0 0 0 ...
$ FullBath : int 2 1 2 1 2 2 1 2 2 2 ...
$ HalfBath : int 1 0 0 0 0 0 1 0 0 0 ...
$ BedroomAbvGr : int 3 2 2 2 3 3 3 3 4 4 ...
$ KitchenAbvGr : int 1 2 1 1 1 1 1 1 1 2 ...
$ KitchenQual : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 5 4 3 3 4 3 3 ...
$ TotRmsAbvGrd : int 8 5 6 4 7 6 6 6 7 8 ...
$ Functional : Ord.factor w/ 8 levels "Sal"<"Sev"<"Maj2"<..: 8 8 8 8 8 8 8 8 8 8 ...
$ Fireplaces : int 0 2 0 0 1 0 0 2 1 0 ...
$ FireplaceQu : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 4 1 1 5 1 1 4 5 1 ...
$ GarageType : Factor w/ 7 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 2 7 ...
$ GarageYrBlt : int 2003 1939 2006 1920 2006 1977 1970 1977 1948 NA ...
$ GarageFinish : Ord.factor w/ 4 levels "None"<"Unf"<"RFn"<..: 3 3 3 2 3 2 3 3 2 1 ...
$ GarageCars : int 2 1 2 1 3 1 1 2 1 0 ...
$ GarageArea : num 548 205 632 216 824 328 286 480 312 0 ...
$ GarageQual : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 5 4 3 4 4 4 4 4 1 ...
$ GarageCond : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 4 4 4 4 1 ...
$ PavedDrive : Ord.factor w/ 4 levels "None"<"N"<"P"<..: 4 4 4 2 4 4 4 4 4 4 ...
$ WoodDeckSF : num 0 0 105 0 144 210 238 168 0 0 ...
$ OpenPorchSF : num 61 4 61 20 104 0 0 68 0 0 ...
$ EnclosedPorch: num 0 0 0 94 0 0 0 0 108 0 ...
$ X3SsnPorch : num 0 0 0 0 0 0 0 0 0 0 ...
$ ScreenPorch : num 0 0 0 0 0 0 0 0 0 0 ...
$ PoolArea : num 0 0 0 0 0 0 0 0 0 0 ...
$ PoolQC : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ Fence : Factor w/ 5 levels "GdPrv","MnPrv",..: 5 5 5 5 5 5 5 5 5 5 ...
$ MiscFeature : Factor w/ 6 levels "Elev","Gar2",..: 6 6 6 6 6 6 6 6 6 6 ...
$ MiscVal : num 0 0 0 0 0 0 0 0 0 0 ...
$ MoSold : int 2 1 2 1 7 11 8 2 8 5 ...
$ YrSold : int 2008 2008 2010 2007 2008 2008 2009 2010 2008 2010 ...
$ SaleType : Factor w/ 10 levels "WD","CWD","VWD",..: 1 1 1 1 1 1 1 1 1 1 ...
$ SaleCondition: Factor w/ 6 levels "Normal","Abnorml",..: 1 1 1 2 1 1 1 1 1 1 ...
$ SalePrice : num 208500 118000 206000 86000 232000 ...
There are 715 total observations in the validation training set, across 81 features (target feature “Saleprice” and id column “Id” included). 46 features are factors, 24 of which are ordered. 14 are integers. 20 are doubles, including “SalePrice”. (“Id” is integers cast as a character type.)
The full correlation grid is too large for most screens, but there are only a handful of noteworthy correlations which I’ll include with further analysis of each feature.
# ggcorr(
# select(val_train_Xy, where(is.numeric)),
# label = T,
# label_round = 2,
# label_size = 3
# )
I wrote a simple algorithm to try various transformations on each continuous feature and use the Shapiro-Wilk Normality Test to choose the best transformation. I then visualize each feature and decide if it even makes sense to attempt normalization.
I should have made logarithmic transformations be that of x+1, but instead I excluded 0s, which only partially handled the issue. 1s still convert to 0s in that case. The result was that variables with a substantial number of 1s did not find logs very useful for normalization.
I also did not include more dynamic transformations like Box-Cox. The script could be modified to include them.
For variables in which a 0 indicates a missing feature, I normalized only the non-zero set. The idea was that it might aid regression when the variable is put into interaction with its missingness.
funcs_lst = list(
'no_func' = function (x) { x },
'sqrt' = sqrt,
'cbrt' = function(x) { x^(1/3) },
'square' = function(x) { x^2 },
###
### FIXME
# Make log transformations of x+1.
###
'log' = log,
'log2' = log2,
'log10' = log10,
'1/x' = function (x) { 1/x },
'2^(1/x)' = function (x) { 2^(1/x) }
# Box Cox: write function that calls MASS::boxcox and include lambda in results/function returned.
# Yeo-Johnson
# Winsorize here?
# Rank
# Rank-Gauss
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
print("Best normalizing transformations:")
[1] "Best normalizing transformations:"
for (feat in names(best_normalizers)) {
func_name = best_normalizers[[feat]]$best_func$name
print(
paste(
feat, ":", func_name,
", p-value:", best_normalizers[[feat]]$results[[func_name]]$p.value
)
)
}
[1] "LotFrontage : log10 , p-value: 0.896193826437694"
[1] "LotArea : log10 , p-value: 2.26152919548748e-17"
[1] "YearBuilt : square , p-value: 0.0108822707033521"
[1] "YearRemodAdd : no_func , p-value: 0.0256596409501691"
[1] "MasVnrArea : cbrt , p-value: 0.954476102599975"
[1] "BsmtFinSF1 : cbrt , p-value: 4.83197463170721e-05"
[1] "BsmtFinSF2 : cbrt , p-value: 0.576669464478684"
[1] "BsmtUnfSF : cbrt , p-value: 0.00100793900881088"
[1] "TotalBsmtSF : log , p-value: 2.08019611846809e-06"
[1] "X1stFlrSF : log2 , p-value: 0.0309515511380178"
[1] "X2ndFlrSF : sqrt , p-value: 0.580431655244041"
[1] "LowQualFinSF : sqrt , p-value: 0.12996282240508"
[1] "GrLivArea : log2 , p-value: 0.0129611332961232"
[1] "BsmtFullBath : , p-value: "
[1] "BsmtHalfBath : , p-value: "
[1] "FullBath : , p-value: "
[1] "HalfBath : , p-value: "
[1] "BedroomAbvGr : sqrt , p-value: 0.995915265851324"
[1] "KitchenAbvGr : , p-value: "
[1] "TotRmsAbvGrd : no_func , p-value: 0.972016654577295"
[1] "Fireplaces : , p-value: "
[1] "GarageYrBlt : square , p-value: 0.00823549702716429"
[1] "GarageCars : no_func , p-value: 0.971877057620897"
[1] "GarageArea : sqrt , p-value: 0.203272774347792"
[1] "WoodDeckSF : cbrt , p-value: 0.991997742575695"
[1] "OpenPorchSF : cbrt , p-value: 0.898709370969093"
[1] "EnclosedPorch : no_func , p-value: 0.526291715345031"
[1] "X3SsnPorch : sqrt , p-value: 0.516253205278421"
[1] "ScreenPorch : cbrt , p-value: 0.608040279289975"
[1] "PoolArea : , p-value: "
[1] "MiscVal : log2 , p-value: 0.280656484036341"
[1] "MoSold : no_func , p-value: 0.875731443365881"
[1] "YrSold : no_func , p-value: 0.96717393596804"
[1] "SalePrice : log , p-value: 0.72923438230646"
x = 'SalePrice'
summary(val_train_Xy[[x]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
39300 129950 160000 181697 212500 745000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5000,
t_binw = 1/50
)
NULL
val_train_Xy = val_train_Xy %>%
mutate('log(SalePrice)' = log(SalePrice))
x = 'log(SalePrice)'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
log(SalePrice)
Min. :10.58
1st Qu.:11.77
Median :11.98
Mean :12.03
3rd Qu.:12.27
Max. :13.52
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/100,
t_binw = 1/100
)
NULL
A natural log best normalizes the sale price distribution. However, because it isn’t a log10 transformation, it won’t precisely scale the prediction errors proportionally to the sale price. I could simply use the log10 instead, which does a fair job at normalizing as well, but I want to stick with the “best” transformation to make the best model. So, when I run ML, I will write a custom summary function to train with which simply divides the error by the price (the log(SalePrice)) before calculating the RMSE.
Here I’m looking for the best Winsorization quantile values of the best transformation – the best according to Shapiro-Wilk p-values. I could have programmatically explored the space and returned the best result. But, I want to visualize it and explore the process itself. In future projects, I might choose to further automate this.
It should be noted that Winsorization of the target variable should only be used for training, not for testing. A log transformation can be reversed as a vectorized operation, but Winsorization can’t. Winsorization should improve the accuracy of the model, but would be cheating on the test.
qqnorm(y = val_train_Xy$SalePrice, ylab = 'SalePrice')
qqline(y = val_train_Xy$SalePrice, ylab = 'SalePrice')
qqnorm(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')
qqline(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')
Win_log_x = Winsorize(
x = val_train_Xy[['log(SalePrice)']],
probs = c(0.005, 0.995)
)
qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x, ylab = 'Win_log_x')
Win_raw_x = Winsorize(
x = val_train_Xy[['SalePrice']],
probs = c(0, 0.95)
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$SalePrice))
Shapiro-Wilk normality test
data: val_train_Xy$SalePrice
W = 0.86089, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log(SalePrice)`))
Shapiro-Wilk normality test
data: val_train_Xy$`log(SalePrice)`
W = 0.9904, p-value = 0.0001299
print(shapiro.test(x = Win_log_x))
Shapiro-Wilk normality test
data: Win_log_x
W = 0.99062, p-value = 0.0001609
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.93275, p-value < 2.2e-16
A small Winsorization of the log best normalizes the variable (W = 0.99062). It doesn’t pass the test for normality (p < 0.01), but it is still better prepared for a linear regression.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(SalePrice)' = Winsorize(
SalePrice,
probs = c(0, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log(SalePrice))' = Winsorize(
log(SalePrice),
probs = c(0.005, 0.995),
na.rm = T
)
)
Here are the correlations between SalePrice and the rest of the variables, compared to those of the transformed variables. Transforming SalePrice resulted in minor increases of correlations to many other continuous features and some minor reductions of correlations, an overall minor and insignificant improvement. But, this is the first of the variables to be transformed.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x = 'Win(log(SalePrice))'
x_lst = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))',
'Win(SalePrice)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
SalePrice log(SalePrice) Win(log(SalePrice))
Min. :0.007746 Min. :0.001275 Min. :0.001406
1st Qu.:0.124877 1st Qu.:0.130580 1st Qu.:0.131486
Median :0.306100 Median :0.314440 Median :0.314346
Mean :0.310147 Mean :0.322002 Mean :0.322153
3rd Qu.:0.520650 3rd Qu.:0.538367 3rd Qu.:0.539042
Max. :0.689569 Max. :0.687804 Max. :0.686782
Win(SalePrice)
Min. :0.006338
1st Qu.:0.116871
Median :0.312263
Mean :0.317893
3rd Qu.:0.515569
Max. :0.686731
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
I’ll need to hard code the top and bottom limits for the engineering script to apply to the test set without leakage. And, I’ll need to drop Win(SalePrice).
I want to keep the transformed and non-Winsorized version though. In the case of the target variable, I’ll train on the Winsorized variable and test on the transformed because I can reverse the transformation. In the case of predictor variables, the Winsorized version will be good for basic linear regression without interactions, but not necessarily for KNN and RF which may be able to use the outliers for clustering and grouping.
x = 'Win(log(SalePrice))'
min_val = min(val_train_Xy[[x]])
max_val = max(val_train_Xy[[x]])
print(paste("min_val:", min_val))
[1] "min_val: 10.9579480025541"
print(paste("max_val:", max_val))
[1] "max_val: 13.2279465702719"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log(SalePrice))' = Winsorize(
.data[['log(SalePrice)']],
minval = min_val,
maxval = max_val
)
) %>%
select(-c('Win(SalePrice)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1/50)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
To aid visualization, I’ll create a SalePrice factor with extremes and quartiles as levels.
val_train_Xy = val_train_Xy %>%
mutate(
'SalePrice.fact' = cut(
x = SalePrice,
breaks = quantile(x = SalePrice),
include.lowest = T,
ordered_result = T
)
)
summary(val_train_Xy$SalePrice.fact)
[3.93e+04,1.3e+05] (1.3e+05,1.6e+05] (1.6e+05,2.12e+05]
179 179 178
(2.12e+05,7.45e+05]
179
I’ll use log(SalePrice) to visualize against factors, rather than the Winsorized version.
x = 'MSSubClass'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "60" "20" "30" "80" "160" "50" "70" "120"
The most common class is 1-story newer than 1945 (267), followed by 2-story newer than 1945 (149) and 1.5-story finished all ages (67). The priciest class is 2-story newer than 1945, though some classes are so uncommon that it’s hard to say this completely confidently.
This feature is a mix of information mostly covered by HouseStyle, YearBuilt, and square footage. It might be worth dropping it to avoid overweighting this info, avoid spurious fit, and skip the compute cost of 16 one-hot features. Alternatively, decomposition with PCA might help pull out the unique information regarding PUD housing.
x = 'MSZoning'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "RL" "RM" "FV"
Mostly residential low density (574), some medium density (100), fewer floating village residential (flexible zoning, 31). Predictive power may be limited due to lack of diversity. That said, low-density residential and floating village clearly tend to sell for more than medium-density. Consider only one-hot encoding RL, RM, and FV?
x = 'LotFrontage'
summary(val_train_Xy[[x]])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
21.00 58.00 70.00 70.04 80.00 313.00 133
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5,
t_binw = 1/50
)
NULL
A log10 scale centers it better (133 missing values excluded).
Note the extreme spike (~65 observations) in left of mode (70-75 SF) at 60 SF. It doesn’t seem to be associated with any particular neighborhood or lot configuration or anything, but probably just a common way to cut lots.
The feature could benefit from top/bottom coding.
133 NAs. Counterintuitively, a lower proportion of missing LotFrontages are inside lots (55/121 in the NA subset vs. 511/715 in the training set [these numbers are from a previous split and not accurate for the current data set]), whereas many lots that by definition have frontage (44 corner lots, FR2, and FR3) are missing frontage values. Could use LotArea, LotShape, LotConfig, and (?) (all of which aren’t missing values) to multivariate impute.
x = 'log10(LotFrontage)'
val_train_Xy = val_train_Xy %>%
mutate(
'log10(LotFrontage)' = ifelse(
LotFrontage == 0,
0,
log10(LotFrontage)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
log10(LotFrontage)
Min. :1.322
1st Qu.:1.763
Median :1.845
Mean :1.820
3rd Qu.:1.903
Max. :2.496
NA's :133
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/50,
t_binw = 1/50
)
NULL
qqnorm(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')
qqline(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_log10_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10_x, ylab = 'Win_log10_x')
qqline(y = Win_log10_x, ylab = 'Win_log10_x')
Win_raw_x = Winsorize(
x = val_train_Xy$LotFrontage,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(LotFrontage)')
qqline(y = Win_raw_x, ylab = 'Win(LotFrontage)')
print(shapiro.test(x = val_train_Xy$LotFrontage))
Shapiro-Wilk normality test
data: val_train_Xy$LotFrontage
W = 0.87621, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy[[x]]))
Shapiro-Wilk normality test
data: val_train_Xy[[x]]
W = 0.94238, p-value = 3.051e-14
print(shapiro.test(x = Win_log10_x))
Shapiro-Wilk normality test
data: Win_log10_x
W = 0.97393, p-value = 1.16e-08
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.9771, p-value = 6.718e-08
It looks like just Winsorizing the raw variable may be the way to go here.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotFrontage)' = Winsorize(
LotFrontage,
probs = c(0.05, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log10(LotFrontage))' = Winsorize(
log10(LotFrontage),
probs = c(0.05, 0.99),
na.rm = T
)
)
x = 'Win(LotFrontage)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotFrontage', 'log10(LotFrontage)', 'Win(log10(LotFrontage))', 'Win(LotFrontage)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
LotFrontage log10(LotFrontage) Win(log10(LotFrontage))
Min. :0.001316 Min. :0.002122 Min. :0.002136
1st Qu.:0.052789 1st Qu.:0.036725 1st Qu.:0.036057
Median :0.152575 Median :0.123545 Median :0.121036
Mean :0.189170 Mean :0.163383 Mean :0.162507
3rd Qu.:0.318210 3rd Qu.:0.285571 3rd Qu.:0.300117
Max. :0.515867 Max. :0.466945 Max. :0.430664
Win(LotFrontage)
Min. :0.00368
1st Qu.:0.05062
Median :0.13177
Mean :0.17398
3rd Qu.:0.31994
Max. :0.44127
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(LotFrontage)'
min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
[1] "min_val: 34.05"
print(paste("max_val:", max_val))
[1] "max_val: 108"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotFrontage)' = Winsorize(
LotFrontage,
minval = min_val,
maxval = max_val
)
)
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
'BldgType', 'HouseStyle')
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy, x = y, y = 'log10(LotFrontage)') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
Overall, the clusters of lots at 60’ and 80’ is quite apparent.
Unsurprisingly, low-density residential tends to have more lot frontage than medium-density residential. Slightly irregular lots might tend to have more frontage than regular, but we can’t say that with much confidence. Corner lots have more frontage than inside lots. There’s quite a bit of variation between neighborhoods.
Looking at MSSubClass, older homes tend to have less lot frontage than their equivalent house styles after 1945, unless they are PUD homes, which tend to have much less frontage than the rest of the classes. This connection is not visible in YearBuilt, which has no correlation to LotFrontage.
There’s also an interesting gap between 50’ and 60’ that only homes older than 1945 tend to fill, except for PUD homes.
fenced_jbv(
data = val_train_Xy,
x = 'MSSubClass',
y = 'log10(LotFrontage)',
jit_col = 'SalePrice.fact',
leg_lbl = 'SalePrice',
jit_alpha = 0.5,
box_color = 'red'
)
ggplot(val_train_Xy, aes(x = `log10(LotFrontage)`, y = `log(SalePrice)`)) +
geom_point(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(vars(MSSubClass), ncol = 5)
x = 'LotArea'
summary(val_train_Xy[[x]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
1596 7610 9477 10346 11611 115149
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 200,
t_binw = 1/50
)
NULL
x_trans = 'log10(LotArea)'
val_train_Xy = val_train_Xy %>%
mutate('log10(LotArea)' = log10(LotArea))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x_trans])
log10(LotArea)
Min. :3.203
1st Qu.:3.881
Median :3.977
Mean :3.960
3rd Qu.:4.065
Max. :5.061
sum_and_trans_cont(
data = val_train_Xy,
x = x_trans,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/50,
t_binw = 1/500
)
NULL
x_trans = 'log10(log10(LotArea))'
val_train_Xy = val_train_Xy %>%
mutate('log10(log10(LotArea))' = log10(log10(LotArea)))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x_trans])
log10(log10(LotArea))
Min. :0.5056
1st Qu.:0.5890
Median :0.5995
Mean :0.5970
3rd Qu.:0.6090
Max. :0.7043
sum_and_trans_cont(
data = val_train_Xy,
x = x_trans,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/500,
t_binw = 1/750
)
NULL
I doubt it’s worth doing the third log10 transformation now that the median and mean are so close. It still needs top- and bottom-coding anyway.
Even the second transformation might lead to overfit, but I’ll roll with it.
qqnorm(y = val_train_Xy$LotArea, ylab = 'LotArea')
qqline(y = val_train_Xy$LotArea, ylab = 'LotArea')
qqnorm(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')
qqline(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')
qqnorm(
y = val_train_Xy$`log10(log10(LotArea))`,
ylab = 'log10(log10(LotArea))'
)
qqline(
y = val_train_Xy$`log10(log10(LotArea))`,
ylab = 'log10(log10(LotArea))'
)
Win_log10log10_x = Winsorize(
x = val_train_Xy$`log10(log10(LotArea))`,
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')
qqline(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')
Win_log10_x = Winsorize(
x = val_train_Xy$`log10(LotArea)`,
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10_x, ylab = 'Win(log10(LotArea))')
qqline(y = Win_log10_x, ylab = 'Win(log10(LotArea))')
Win_raw_x = Winsorize(
x = val_train_Xy$LotArea,
probs = c(0.01, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'LotArea')
qqline(y = Win_raw_x, ylab = 'LotArea')
print(shapiro.test(x = val_train_Xy$LotArea))
Shapiro-Wilk normality test
data: val_train_Xy$LotArea
W = 0.5211, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(LotArea)`))
Shapiro-Wilk normality test
data: val_train_Xy$`log10(LotArea)`
W = 0.9113, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(log10(LotArea))`))
Shapiro-Wilk normality test
data: val_train_Xy$`log10(log10(LotArea))`
W = 0.90238, p-value < 2.2e-16
print(shapiro.test(x = Win_log10log10_x))
Shapiro-Wilk normality test
data: Win_log10log10_x
W = 0.94232, p-value = 4.825e-16
print(shapiro.test(x = Win_log10_x))
Shapiro-Wilk normality test
data: Win_log10_x
W = 0.94434, p-value = 9.78e-16
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.97759, p-value = 5.239e-09
It looks like simply Winsorizing the base variable might be best.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotArea)' = Winsorize(
LotArea,
probs = c(0.01, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log10(LotArea))' = Winsorize(
log10(LotArea),
probs = c(0.05, 0.99),
na.rm = T
)
)
Transforming LotArea with log10 resulted in bigger swings in r in both directions, but no real change in aggregate. The additional log10 transformation produced minor changes in correlation compared to the initial transformation, mostly toward less correlation.
Unsurprisingly, transformed LotArea is much more correlated to transformed LotFrontage than their untransformed counterparts (r went from 0.51 to 0.73). Also, transformed LotArea and transformed SalePrice correlate a little better than their untransformed counterparts, but are still weakly correlated (r increased from 0.30 to 0.37).
It also became noticeably more correlated to square footage of the first floor, bedrooms / total rooms above ground, and garage area/cars. Like previous transformations, some correlations lessened with this transformation, but none so much that the correlation dropped a bracket, e.g. from weak to insignificant.
The distribution of correlations dropped with the second transformation. I’ll only use the first transformation in the engineering script.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotArea', 'log10(LotArea)', 'log10(log10(LotArea))', 'Win(log10(LotArea))', 'Win(LotArea)')
x = 'Win(LotArea)'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)
LotArea log10(LotArea) log10(log10(LotArea))
Min. :0.0008156 Min. :0.001463 Min. :0.001822
1st Qu.:0.0298980 1st Qu.:0.038420 1st Qu.:0.038591
Median :0.1411549 Median :0.128146 Median :0.125858
Mean :0.1692131 Mean :0.210784 Mean :0.208681
3rd Qu.:0.2935533 3rd Qu.:0.351718 3rd Qu.:0.343608
Max. :0.5149010 Max. :0.728610 Max. :0.741522
Win(log10(LotArea)) Win(LotArea)
Min. :0.005534 Min. :0.0001219
1st Qu.:0.045043 1st Qu.:0.0704055
Median :0.124050 Median :0.1270071
Mean :0.216047 Mean :0.2244330
3rd Qu.:0.359245 3rd Qu.:0.3686740
Max. :0.697739 Max. :0.6861456
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(LotArea)'
min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
[1] "min_val: 1975.96"
print(paste("max_val:", max_val))
[1] "max_val: 16946.4"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotArea)' = Winsorize(
LotArea,
minval = min_val,
maxval = max_val
)
) %>%
select(-c('log10(LotArea)', 'Win(log10(LotArea))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 100)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
y_lst = c('log(SalePrice)', 'Win(LotFrontage)', 'X1stFlrSF',
'GrLivArea','TotRmsAbvGrd', 'GarageArea')
x = 'Win(LotArea)'
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
'BldgType', 'HouseStyle')
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy,
x = y,
y = 'log10(log10(LotArea))',
jit_h = 0 # Again R randomly decides to go wonk unless I enter the default.
) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
There are similar patterns as in LotFrontage, with a more marked upward trend against LotShape, and with less difference between lot configurations. There’s also an interesting pocket of two-story houses with low lot area; it’s not worth plotting, but you can see which neighborhoods these are.
Really lopsided to paved (3 gravel, 712 paved). Drop this feature.
summary(val_train_Xy$Street)
None Grvl Pave
0 3 712
val_train_Xy = select(val_train_Xy, -c('Street'))
Vast majority have none, drop it.
summary(val_train_Xy$Alley)
None Grvl Pave
669 24 22
val_train_Xy = select(val_train_Xy, -c('Alley'))
x = 'LotShape'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Reg" "IR1"
Irregularly shaped lots tend to sell for more. Good candidate for binarization if doing a basic linear regression with no interactions; one-hot encode ‘Reg’ and drop the rest of the levels.
summary(val_train_Xy$LandContour)
Lvl Bnk HLS Low
642 33 24 16
val_train_Xy = select(val_train_Xy, -c('LandContour'))
All all-public. Definitely drop.
summary(val_train_Xy$Utilities)
None ELO NoSeWa NoSewr AllPub
0 0 0 0 715
val_train_Xy = select(val_train_Xy, -c('Utilities'))
x = 'LotConfig'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Inside" "Corner" "CulDSac"
This set is mostly inside lots (513) but plenty of corner lots (124). I’ve always thought corner lots are prized, but it doesn’t seem to significantly add to price compared to an inside lot.
Cul de sacs are the priciest. This is somewhat a proxy for neighborhood (and maybe other features) as it is true across most neighborhoods except those that are already pricey (where cul de sacs are relatively more common) or least pricey (where cul de sacs don’t typically exist).
ggplot(
data = val_train_Xy,
# data = filter(
# val_train_Xy,
# LotConfig %in% c('Inside', 'Corner', 'CulDSac')
# ),
mapping = aes(
x = Neighborhood,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotConfig, shape = LotConfig)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
data = val_train_Xy,
mapping = aes(
x = Neighborhood,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .3,
mapping = aes(color = LotShape, shape = LotShape)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
data = val_train_Xy,
mapping = aes(
x = LotConfig,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotShape, shape = LotShape)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0)
ggplot(
data = val_train_Xy,
mapping = aes(
x = LotShape,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotConfig, shape = LotConfig)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0)
There appears to be some overlap with lot shape and lot configuration.
Vast majority are gentle drop it.
summary(val_train_Xy$LandSlope)
None Gtl Mod Sev
0 677 32 6
val_train_Xy = select(val_train_Xy, -c('LandSlope'))
x = 'Neighborhood'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
keysize = 2
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "CollgCr" "BrkSide" "OldTown" "Somerst" "NWAmes" "Sawyer" "Edwards"
[8] "Crawfor" "NridgHt" "Gilbert" "Names"
North Ames (Names) and CollgCr have the most residential homes (107 and 76). but they’re also zoned low-density. A handful of neighborhoods have almost no houses in this set.
The priciest neighborhoods are StoneBr, NridgeHt, and NoRidge. The least pricey (OldTown, MeadowV, and IDOTRR) are also the most dense and commercial. There are significant differences in prices between many neighborhoods, and not just between the cheapest and priciest.
ggplot(
data = val_train_Xy,
aes(x = Neighborhood, fill = MSZoning)
) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The vast majority are normal. Drop Condition2. But, for Condition1, there appear to be significant differences in SalePrice between Norm and Feedr, and maybe between Norm and Artery but there are too few Artery observations. It might be worth one-hot-encoding those categories.
x = 'Condition1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Norm" "Feedr"
You might consider binarizing, lumping ‘Feedr’ and ‘Artery’ and maybe ‘RRAe’ together and lumping the rest with ‘Norm’. I’ll try it out during feature selection with caret in the ML phase.
summary(val_train_Xy$Condition2)
Artery Feedr Norm RRNn RRAn PosN PosA RRNe RRAe
1 2 709 1 0 0 1 0 1
val_train_Xy = select(val_train_Xy, -c('Condition2'))
x = 'BldgType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
Majority single-family (603), 24 NAs. It seems like an inherently important feature, despite the lopsidedness of the distribution. Probably safe to impute to mode (1Family), but other features might inform, such as MSSubClass, MSZoning, Neighborhood, HouseStyle, and building materials; multivariate impute might be in order, if keeping the feature in the first place.
Duplexes and two-family are significantly cheaper than single-family and townhouses, if accepting the low number in the sample. Candidate for binarization, but may lose interactions.
x = 'HouseStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "2Story" "1Story" "SLvl" "1.5Fin"
Mostly one-story (358), but many two-story (220). Several significant price differences across groups. Maybe worth keeping, but also kind of noisy with the finished/unfinished business only applying to half-stories, and number of stories and finished status are encoded in other features.
x = 'OverallQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "7" "5" "8" "4" "6"
There’s a very strong relationship to SalePrice.
It’s a pretty normal distribution, slightly left-skewed. Mode (2199 5s) left of median/mean (180 6s), few 1s, 2s, and 3s. It might be worth casting as an integer and transforming to normalize and possibly improve the correlation.
x = 'OverallQual_int'
val_train_Xy = val_train_Xy %>%
mutate(OverallQual_int = as.integer(OverallQual))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
OverallQual_int
Min. : 1.000
1st Qu.: 5.000
Median : 6.000
Mean : 6.092
3rd Qu.: 7.000
Max. :10.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
None of the transformations improved its distribution.
Here are the correlations between OverallQual_int and the rest of the variables.
This feature has several moderate correlations to features having to do with size and age. It also has a strong correlation to transformed SalePrice (0.82).
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
OverallQual_int
Min. :0.01751
1st Qu.:0.13440
Median :0.27000
Mean :0.31424
3rd Qu.:0.54465
Max. :0.82099
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
'GrLivArea','FullBath', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars',
'GarageArea')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
Again, there’s a lot of interaction with MSSubClass and Neighborhood. Quality also decreases with zoning density. Two-story houses are generally rated higher than one-story houses. Vinyl gets rated highest among exteriors. Simply having masonry improves quality rating. Poured concrete foundations on average receive 7s, whereas cinder block and brick/tile receive 5s on average. The same is true for attached and built-in garages compared to detached and no garages. It almost looks like it’s better to have no fence than to have one with minor privacy. Court officer deeds/estates are rated more poorly than warranty deeds and new sales. Abnormal sales are rated lower than normal sales.
Overall quality is doing a lot of the work for other features toward predicting price.
y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle',
'RoofMatl', 'RoofStyle', 'Exterior1st', 'MasVnrType', 'Foundation',
'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
'Fence', 'MiscFeature', 'SaleType', 'SaleCondition')
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
x = 'OverallCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "5" "6" "8" "4" "7"
OverallCond is like OverallQual, but with a much more pronounced mode (397 5s) left of median/mean (124 6s), probably due to wear and tear being more universal than quality construction.
The correlation to SalePrice is weaker, and 5s oddly seem to sell for more on average than higher-rated houses.
x = 'OverallCond_int'
val_train_Xy = val_train_Xy %>%
mutate(OverallCond_int = as.integer(OverallCond))
# Recalculate best normalizers. Might as well do them all, see if previous
# transformations benefit from further transformation, while we're at it.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
OverallCond_int
Min. :1.00
1st Qu.:5.00
Median :5.00
Mean :5.55
3rd Qu.:6.00
Max. :9.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
This feature is only weakly correlated to a couple of age features. It has no linear correlation to SalePrice.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
OverallCond_int
Min. :0.0008734
1st Qu.:0.0201988
Median :0.0386786
Mean :0.0676805
3rd Qu.:0.1009599
Max. :0.3591485
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
The bump in price among houses of average condition seems to have to do with a cluster of houses made in the late ’90s and early 2000s.
ggplot(
data = val_train_Xy,
mapping = aes(
x = OverallCond_int,
y = .data[['log(SalePrice)']],
color = YearBuilt
)
) +
geom_jitter(alpha = 0.5) +
geom_smooth() #+
# facet_wrap(vars(BldgType))
OldTown seems to be in relatively good shape, while Edwards has proportionately more houses in need of work and reconditioning. You can easily spot clustered 5s in the neighborhoods that likely grew up in the building boom of the 2000s.
y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'BldgType',
'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType',
'BsmtExposure', 'Foundation', 'Heating', 'Electrical', 'Functional',
'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType',
'SaleCondition')
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
Houses with metal and wood exteriors are typically in better condition than those with vinyl despite houses with vinyl siding typically being rated as higher quality and newer. Likewise, houses with cinder block foundations seem to fare better over time than those with poured concrete despite quality ratings, according to this data set, as do houses with detached garages compared to those with attached and built-in garages. Is this because older houses and lower-quality houses that are still standing are the ones that have received better maintenance?
p1 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = Exterior1st,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
p2 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = Foundation,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
p3 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = GarageType,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
grid.arrange(p1, p2, p3, ncol = 2)
No transformations normalized the distribution. You can see the construction boom between WWII and the ’80s (with a dip in mid ’70s stagflation), followed by the relative explosion starting in the late ’90s and dropping with the housing crisis in the 2000s.
Consider grouping around modes into a factor and dummy coding to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well.
x = 'YearBuilt'
summary(val_train_Xy[x])
YearBuilt
Min. :1872
1st Qu.:1954
Median :1972
Mean :1971
3rd Qu.:2000
Max. :2010
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
ggplot(val_train_Xy, aes(x = YearBuilt)) +
geom_histogram(binwidth = 1)
x = 'YearBuilt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
YearBuilt
Min. :0.0009914
1st Qu.:0.0534053
Median :0.1616352
Mean :0.2334731
3rd Qu.:0.3684433
Max. :0.8301510
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = colnames(select(val_train_Xy, where(is.numeric)))
for (y in y_lst) {
for (x in x_lst) {
plt = ggplot(
select(val_train_Xy, all_of(c(x, y))),
aes(x = .data[[x]], y = .data[[y]])
) +
geom_jitter() +
geom_smooth() +
labs(x = x, y = y) +
scale_x_continuous(breaks = seq(1880, 2010, 5)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(plt)
}
}
y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
Before looking at remodel year, this is the story I gather about houses purchased in this period based on the above visualizations of YearBuilt. I could try to weave the viz in with the narrative, but I’m not going to.
Houses and garages have gotten bigger over the years, with more bathrooms and less enclosed porch space. (I’m curious to see whether the increasing rarity of enclosed porch space led to it being a prized (priced) feature, or if it simply isn’t as valued now.) Newer homes tend to be valued more, and builders generally began to produce higher-quality houses.
Most of the houses built in the post-WWII boom were single-story, but that’s also when 2 1/2 stories became unheard-of. Split/multi-level and duplexes began to come on the scene. 1 1/2 stories had their heyday. Some of the houses from then and earlier make up the group of two-family conversions. It would be interesting to check remodel years to see when those conversions tended to take place.
As the century turned, prices grew exponentially. Two-story houses and PUD housing became more prevalent, and townhouses appeared for the first time. Zoning became less dense as new suburbs sprang up. Lots became more irregular. Cul de sacs and parks started to sprinkle in as feeder streets networked out. A handful of houses began to appear near railroads.
Even as masonry became a growing bling factor driving up overall quality, the age of plastic ushered in vinyl siding as the dominant exterior. The overall condition of houses from the turn of the century on is starkly average compared to older houses which are commonly in good shape. Checking against remodel years will likely explain some of this, but I’m curious to compare exterior conditions of different materials with regard to age.
The Lost Generation got brick and tile foundations. Boomers got cinder block and slabs. Gen x and Millenials got poured concrete. This enabled us to build into the sides of hills better and have more exposed basements with walkouts.
Most of the basementless houses were built during the post-WWII boom when ramblers were a common answer to the burgeoning dream of homeownership. Basement area continuously grew from then on, especially finished basement area as people began to live more underground. Kitchens and bedrooms began to be more common below ground.
Kitchens became more of a target for adding value with improved materials, construction, and appliances. Heating systems improved over the years as well. New electrical standards came into place in 1960.
When the ’80s hit, the automobiled society was in full swing, and it was rare to see less than a two-car garage built anymore, let alone a house without a garage. Rather than a detached secondary building, garages became part of the main house itself. It was more often a finished space for more than just housing and working on cars, but without the need for high-quality construction. Unpaved driveways were now out of the question, though.
After 2000, virtually no new houses had fences, at least none that were bought during the years this data set covers.
I’ll condense the housing booms around their modes in a factor that I can one-hot encode to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well. To maintain the model going forward, new periods will have to be identified and added.
# Three periods: < 1945 < 1985ish < 2010
val_train_Xy = val_train_Xy %>%
mutate(
YearBuilt.fact = cut(
x = YearBuilt,
breaks = c(-Inf, 1944, 1986, Inf),
ordered_result = T,
labels = c('Before_1945', '1945_1986', 'After_1986')
)
)
p1 = ggplot(val_train_Xy, aes(x = YearBuilt)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = seq(1880, 2010, 5)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2 = ggplot(val_train_Xy, aes(x = YearBuilt.fact)) +
geom_bar()
grid.arrange(p1, p2)
x = 'YearBuilt.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "After_1986" "Before_1945" "1945_1986"
It might make sense to transform year built into the age of the house when bought. The years of sales is a small range, but it may add a touch more predictive information, especially for the newer houses.
It seems to have added a couple of outliers, but applying a square-root transformation better centers it and removes the outliers.
0s don’t indicate a missing feature, so I want to include them in the search for the best normalizer.
val_train_Xy = val_train_Xy %>%
mutate('Age' = (YrSold - YearBuilt))
x = 'Age'
# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(-1)
)
summary(val_train_Xy[x])
Age
Min. : 0.00
1st Qu.: 8.00
Median : 35.00
Mean : 36.83
3rd Qu.: 55.00
Max. :136.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1/10
)
NULL
x = 'sqrt(Age)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(Age)' = sqrt(Age))
# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(-1)
)
summary(val_train_Xy[x])
sqrt(Age)
Min. : 0.000
1st Qu.: 2.828
Median : 5.916
Mean : 5.341
3rd Qu.: 7.416
Max. :11.662
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/10,
t_binw = 1/10
)
NULL
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'Age', 'sqrt(Age)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
YearBuilt Age sqrt(Age)
Min. :0.0009914 Min. :0.00325 Min. :0.000536
1st Qu.:0.0534053 1st Qu.:0.06282 1st Qu.:0.081377
Median :0.1616352 Median :0.15871 Median :0.165492
Mean :0.2334731 Mean :0.23488 Mean :0.248951
3rd Qu.:0.3684433 3rd Qu.:0.36838 3rd Qu.:0.359191
Max. :0.8301510 Max. :0.82972 Max. :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'Win(log(SalePrice))', 'GarageArea', 'GarageCars',
'EnclosedPorch', 'OpenPorchSF', 'OverallQual_int', 'OverallCond_int')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
There’s a small increase in linear correlation of Age to transformed SalePrice from YearBuilt (0.59 to 0.62). Correlations to some measures of size strengthened. The correlation to quality rose quite a bit (from 0.59 to 0.65) while the correlation to condition only rose slightly (0.36 to 0.37), remaining weak.
With YearBuilt, there’s still a polynomial appearance to the plot against log(SalePrice), but that’s smoothed out with sqrt(Age). YearRemodAdd may add clarity.
The concentration of average-condition houses among the youngest is still puzzling. It’s as if a few years of aging tends to improve the condition of a house. Perhaps owners tend to add cosmetic value in the first years. Maybe assessors use the youngest houses as the anchor by which to assess all others. Many of these houses were unfinished, which explains some but not all of this.
ggplot(
val_train_Xy,
aes(
x = .data[['sqrt(Age)']],
y = OverallCond_int,
shape = SaleCondition,
color = SaleCondition
)
) +
geom_jitter()
val_train_Xy = select(val_train_Xy, -c('Age'))
x = 'YearRemodAdd'
summary(val_train_Xy[x])
YearRemodAdd
Min. :1950
1st Qu.:1966
Median :1994
Mean :1985
3rd Qu.:2004
Max. :2010
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
There was a curiously large number of remodels in 1950, about 80 remodels that year compared to the peak of about 50 in the 2000s. These were all built before 1950, and no house has an earlier remodel year than 1950, and some houses built before 1950 have remodel years later than 1950.
I’ll treat houses with a 1950 remodel as if they have not had a remodel; they may have had an earlier remodel, but I’m guessing that those older remodels are of little added value at this point. Ames assessors may have had reason to bottom-code, but I’ll set remodel year to built year in years prior to 1950 for now and see if it helps or hinders analysis and prediction.
val_train_Xy = val_train_Xy %>%
mutate(YearRemodAdd.uncode = ifelse(YearRemodAdd == 1950, YearBuilt, YearRemodAdd))
x = 'YearRemodAdd.uncode'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
YearRemodAdd.uncode
Min. :1880
1st Qu.:1966
Median :1994
Mean :1982
3rd Qu.:2004
Max. :2010
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = 1)
This only added outliers and slightly weakened the correlation to SalePrice.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'YearRemodAdd', 'YearRemodAdd.uncode')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
YearBuilt YearRemodAdd YearRemodAdd.uncode
Min. :0.0009914 Min. :0.004039 Min. :0.001117
1st Qu.:0.0534053 1st Qu.:0.055532 1st Qu.:0.063792
Median :0.1616352 Median :0.152282 Median :0.145630
Mean :0.2421409 Mean :0.203658 Mean :0.198553
3rd Qu.:0.3684433 3rd Qu.:0.294762 3rd Qu.:0.268835
Max. :0.9624094 Max. :0.654782 Max. :0.657064
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
val_train_Xy = select(val_train_Xy, -c('YearRemodAdd.uncode'))
Removing the records in which there is no remodel (i.e. YearRemodAdd == YearBuilt or 1950) adds clarity. 368 rows were removed for no remodel, and remodels really started being recorded in increasing numbers starting in the ’90s, maybe a little in the late ’80s.
ggplot(
val_train_Xy,
aes(x = ifelse(YearRemodAdd == YearBuilt, NA, YearRemodAdd)
)
) +
geom_histogram(binwidth = 1)
Because there are so many houses without remodels, I’ll split it into a factor, a level for each decade with the year 1950 conveniently lumped in with NAs.
x = 'YearRemodAdd.fact'
val_train_Xy = val_train_Xy %>%
mutate(
YearRemodAdd.fact = factor(
cut(
ifelse(
YearRemodAdd == YearBuilt | is.na(YearRemodAdd),
1949,
YearRemodAdd
),
breaks = c(1949, 1950, 1960, 1970, 1980, 1990, 2000, 2010),
labels = c('None', '50s', '60s', '70s', '80s', '90s', '00s')
)
)
) #%>%
# select(-c('YearRemodAdd'))
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "None" "00s" "90s"
Most are Gable (563), and many are Hip (139). Flat, Gambrel, and Mansard are neglible.
x = 'RoofStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Gable" "Hip"
Vast majority are composition shingle (702). Can drop this feature.
summary(val_train_Xy$RoofMatl)
ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
1 705 1 0 0 4 1 3
val_train_Xy = select(val_train_Xy, -c('RoofMatl'))
Most popular class is vinyl (255 and 249), but wood, metal, and others represent significant classes. No ‘None’ in 2nd, so all houses in Ames have at least two types of siding?
x = 'Exterior1st'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"
x = 'Exterior2nd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"
Most have none (430), but plenty of brick (219) and stone (66). No cinderblock in the split.
x = 'MasVnrType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "BrkFace" "None" "Stone"
x = 'MasVnrArea'
summary(val_train_Xy[x])
MasVnrArea
Min. : 0.0
1st Qu.: 0.0
Median : 0.0
Mean : 103.3
3rd Qu.: 166.5
Max. :1129.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.25
)
NULL
x = 'cbrt(MasVnrArea)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(MasVnrArea)' = MasVnrArea^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(MasVnrArea)
Min. : 0.000
1st Qu.: 0.000
Median : 0.000
Mean : 2.391
3rd Qu.: 5.501
Max. :10.413
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
NULL
Because this variable’s 0s indicate a missing feature, we’re really concerned with Winsorizing non-zero values.
df = val_train_Xy[val_train_Xy$MasVnrArea != 0, ]
qqnorm(y = df$MasVnrArea, ylab = 'MasVnrArea')
qqline(y = df$MasVnrArea, ylab = 'MasVnrArea')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
qqnorm(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')
qqline(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')
Win_sqrt_x = Winsorize(
x = sqrt(sqrt(df$MasVnrArea)),
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')
qqline(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')
Win_cbrt_x = Winsorize(
x = df$`cbrt(MasVnrArea)`,
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')
Win_raw_x = Winsorize(
x = df$MasVnrArea,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(MasVnrArea)')
qqline(y = Win_raw_x, ylab = 'Win(MasVnrArea)')
print(shapiro.test(df$MasVnrArea))
Shapiro-Wilk normality test
data: df$MasVnrArea
W = 0.86407, p-value = 3.522e-15
print(shapiro.test(df$'cbrt(MasVnrArea)'))
Shapiro-Wilk normality test
data: df$"cbrt(MasVnrArea)"
W = 0.99498, p-value = 0.4769
print(shapiro.test(sqrt(sqrt(df$MasVnrArea))))
Shapiro-Wilk normality test
data: sqrt(sqrt(df$MasVnrArea))
W = 0.99153, p-value = 0.09952
print(shapiro.test(Win_sqrt_x))
Shapiro-Wilk normality test
data: Win_sqrt_x
W = 0.99525, p-value = 0.5281
print(shapiro.test(Win_cbrt_x))
Shapiro-Wilk normality test
data: Win_cbrt_x
W = 0.99576, p-value = 0.6327
print(shapiro.test(Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.90384, p-value = 1.551e-12
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
val_train_Xy = val_train_Xy %>%
mutate(
'Win(cbrt(MasVnrArea))' = ifelse(
MasVnrArea == 0,
0,
Winsorize(
x = `cbrt(MasVnrArea)`,
# probs = c(0.005, 0.995),
minval = min_val,
maxval = max_val
)
)
)
x = 'Win(cbrt(MasVnrArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MasVnrArea', 'cbrt(MasVnrArea)', x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
MasVnrArea cbrt(MasVnrArea) Win(cbrt(MasVnrArea))
Min. :0.01480 Min. :0.00778 Min. :0.005013
1st Qu.:0.06893 1st Qu.:0.07272 1st Qu.:0.071175
Median :0.10170 Median :0.13331 Median :0.134064
Mean :0.14780 Mean :0.15174 Mean :0.151855
3rd Qu.:0.23594 3rd Qu.:0.23006 3rd Qu.:0.230854
Max. :0.38455 Max. :0.36920 Max. :0.372665
NA's :1 NA's :1 NA's :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
MasVnrArea cbrt(MasVnrArea) Win(cbrt(MasVnrArea))
Min. :0.0009883 Min. :0.007318 Min. :0.007091
1st Qu.:0.0727252 1st Qu.:0.068257 1st Qu.:0.068430
Median :0.1386111 Median :0.152985 Median :0.153200
Mean :0.1806451 Mean :0.187874 Mean :0.187854
3rd Qu.:0.3070987 3rd Qu.:0.314667 3rd Qu.:0.314633
Max. :0.4212846 Max. :0.430102 Max. :0.430096
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
This variable has a lot of zeros that indicate a missing feature that make it a candidate for binary encoding to aid linear regression. But, MasVnrType already encodes that in ‘None’. Because there’s a significant difference in price between MasVnrType levels, that should suffice. Also, leaving the 0s in improves the correlation to the target variable, so it may be moot.
# already hardcoded this one.
print(paste("min_val:", min_val))
[1] "min_val: 1.52019153849196"
print(paste("max_val:", max_val))
[1] "max_val: 10.278035603362"
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .2)
y_lst = c('MasVnrType')
z = 'SalePrice.fact'
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy,
x = y,
y = 'cbrt(MasVnrArea)',
jit_col = z,
jit_alpha = 0.75,
leg_lb = z,
box_color = 'red'
) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
ggplot(val_train_Xy, aes(x = `cbrt(MasVnrArea)`, y = `log(SalePrice)`)) +
geom_point(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(vars(MasVnrType))
There are some houses with masonry footage but no type ascribed. I noticed this during the wrangling audit and decided to leave it for the modeling phase, maybe to impute the masonry type with caret using a select few variables as indicators or simply imputing the most common type (BrkFace), but maybe create a new level for those handful of undescribed masonry types.
Mostly average (440), many good (241).
x = 'ExterQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Gd" "TA"
Greater majority average (621), still some good (74). May be worth keeping.
x = 'ExterCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
Evenly split between poured concrete and cinder block (317 and 302), but still 72 brick and tile.
x = 'Foundation'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "PConc" "BrkTil" "CBlock"
Could drop Stone and Wood and make it an ordered factor to represent as ints in regression or to one-hot.
Evenly split between average and good (313 and 303), but 63 excellent, and only 26 without basements. I’m having trouble imagining houses with basements and cinder block foundations.
x = 'BsmtQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Gd" "TA" "Ex"
Vast majority average (635).
x = 'BsmtCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
Great majority (about 464) not exposed, but still some average (100), good (71), and minimal exposure (54).
x = 'BsmtExposure'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "No" "Av" "Mn" "Gd"
211 are unfinished, 205 are top quality, and descending counts to low quality (34).
x = 'BsmtFinType1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "GLQ" "Unf" "BLQ" "ALQ" "LwQ" "Rec"
Probably just one-hot GLQ, LwQ, and None for regression.
x = 'BsmtFinSF1'
summary(val_train_Xy[x])
BsmtFinSF1
Min. : 0.0
1st Qu.: 0.0
Median : 375.0
Mean : 446.5
3rd Qu.: 699.0
Max. :5644.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 0.25
)
NULL
x = 'cbrt(BsmtFinSF1)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(BsmtFinSF1)' = BsmtFinSF1^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(BsmtFinSF1)
Min. : 0.000
1st Qu.: 0.000
Median : 7.211
Mean : 5.564
3rd Qu.: 8.875
Max. :17.804
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
NULL
ggplot(val_train_Xy, aes(x = .data[['cbrt(BsmtFinSF1)']])) +
geom_histogram()
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtFinSF1', 'cbrt(BsmtFinSF1)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
BsmtFinSF1 cbrt(BsmtFinSF1)
Min. :0.006172 Min. :0.000255
1st Qu.:0.079152 1st Qu.:0.069259
Median :0.218996 Median :0.115415
Mean :0.220147 Mean :0.157082
3rd Qu.:0.318490 3rd Qu.:0.196450
Max. :0.644301 Max. :0.643687
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
plot_scat_pairs(df = val_train_Xy, x = 'BsmtFinSF1', y_lst = y_lst)
NULL
This will be dropped in favor of a derivative variable, total basement sf, so no need to go any further.
val_train_Xy = select(val_train_Xy, -c('cbrt(BsmtFinSF1)'))
Mostly unfinished (609) but 27 no basements. So, only one house in Ames with a basement doesn’t have a second basement?? This doesn’t seem right. It might be worth dropping this feature.
x = 'BsmtFinType2'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
x = 'BsmtUnfSF'
summary(val_train_Xy[x])
BsmtUnfSF
Min. : 0.0
1st Qu.: 234.0
Median : 470.0
Mean : 564.6
3rd Qu.: 795.5
Max. :2046.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 0.25
)
NULL
x = 'cbrt(BsmtUnfSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(BsmtUnfSF)' = BsmtUnfSF^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(BsmtUnfSF)
Min. : 0.000
1st Qu.: 6.162
Median : 7.775
Mean : 7.402
3rd Qu.: 9.266
Max. :12.695
sum_and_trans_cont(
data = filter(val_train_Xy, BsmtUnfSF != 0),
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
NULL
Because this variable’s 0s indicate a missing basement, just Winsorizing non-zero values.
df = val_train_Xy[val_train_Xy$BsmtUnfSF != 0, ]
qqnorm(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')
qqline(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
x = df[[x]],
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')
Win_raw_x = Winsorize(
x = df$BsmtUnfSF,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'BsmtUnfSF')
qqline(y = Win_raw_x, ylab = 'BsmtUnfSF')
print(shapiro.test(x = df$BsmtUnfSF))
Shapiro-Wilk normality test
data: df$BsmtUnfSF
W = 0.92795, p-value < 2.2e-16
print(shapiro.test(x = df$`cbrt(BsmtUnfSF)`))
Shapiro-Wilk normality test
data: df$`cbrt(BsmtUnfSF)`
W = 0.993, p-value = 0.003542
print(shapiro.test(x = Win_cbrt_x))
Shapiro-Wilk normality test
data: Win_cbrt_x
W = 0.96978, p-value = 2.054e-10
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.93316, p-value < 2.2e-16
x = 'cbrt(BsmtUnfSF)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtUnfSF', 'cbrt(BsmtUnfSF)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
BsmtUnfSF cbrt(BsmtUnfSF)
Min. :0.007563 Min. :0.007861
1st Qu.:0.042330 1st Qu.:0.039367
Median :0.137369 Median :0.128333
Mean :0.135120 Mean :0.122909
3rd Qu.:0.180533 3rd Qu.:0.165133
Max. :0.473352 Max. :0.356293
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
BsmtUnfSF cbrt(BsmtUnfSF)
Min. :0.008363 Min. :0.001348
1st Qu.:0.055197 1st Qu.:0.045389
Median :0.101256 Median :0.078388
Mean :0.127480 Mean :0.108878
3rd Qu.:0.156947 3rd Qu.:0.141344
Max. :0.534817 Max. :0.539121
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'TotalBsmtSF'
summary(val_train_Xy[x])
TotalBsmtSF
Min. : 0.0
1st Qu.: 793.5
Median : 981.0
Mean :1058.1
3rd Qu.:1291.0
Max. :6110.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1/20
)
NULL
x = 'log(TotalBsmtSF)'
val_train_Xy = val_train_Xy %>%
mutate(
'log(TotalBsmtSF)' = ifelse(
TotalBsmtSF <= 0,
0,
log(TotalBsmtSF)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
log(TotalBsmtSF)
Min. :0.000
1st Qu.:6.676
Median :6.889
Mean :6.734
3rd Qu.:7.163
Max. :8.718
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/20,
t_binw = 1
)
NULL
x = 'square(log(TotalBsmtSF))'
val_train_Xy = val_train_Xy %>%
mutate(
'square(log(TotalBsmtSF))' = ifelse(
TotalBsmtSF <= 0,
0,
log(TotalBsmtSF)^2
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
square(log(TotalBsmtSF))
Min. : 0.00
1st Qu.:44.58
Median :47.45
Mean :46.77
3rd Qu.:51.31
Max. :76.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
Just checking non-zero set.
df = val_train_Xy[val_train_Xy$TotalBsmtSF != 0, ]
qqnorm(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')
qqline(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_log_x_squared = Winsorize(
x = df[[x]],
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_log_x_squared, ylab = 'Win_log_x_squared')
qqline(y = Win_log_x_squared, ylab = 'Win_log_x_squared')
Win_raw_x = Winsorize(
x = df$TotalBsmtSF,
probs = c(0, 0.99),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = df$TotalBsmtSF))
Shapiro-Wilk normality test
data: df$TotalBsmtSF
W = 0.84661, p-value < 2.2e-16
print(shapiro.test(x= df$`square(log(TotalBsmtSF))`))
Shapiro-Wilk normality test
data: df$`square(log(TotalBsmtSF))`
W = 0.98181, p-value = 1.344e-07
print(shapiro.test(x = Win_log_x_squared))
Shapiro-Wilk normality test
data: Win_log_x_squared
W = 0.99018, p-value = 0.0001361
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.9591, p-value = 5.387e-13
x = 'Win(square(log(TotalBsmtSF)))'
val_train_Xy = val_train_Xy %>%
mutate(
'Win(square(log(TotalBsmtSF)))' = ifelse(
TotalBsmtSF <= 0,
0,
Winsorize(
x = log(TotalBsmtSF)^2,
probs = c(0.005, 0.995),
na.rm = T
)
)
) %>%
mutate(
'Win(log(TotalBsmtSF))' = ifelse(
TotalBsmtSF <= 0,
0,
Winsorize(
x = log(TotalBsmtSF),
probs = c(0.005, 0.995),
na.rm = T
)
)
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtSF', 'log(TotalBsmtSF)', 'square(log(TotalBsmtSF))',
'Win(square(log(TotalBsmtSF)))')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
TotalBsmtSF log(TotalBsmtSF) square(log(TotalBsmtSF))
Min. :0.001736 Min. :0.001862 Min. :0.004075
1st Qu.:0.111764 1st Qu.:0.067788 1st Qu.:0.076938
Median :0.352060 Median :0.155643 Median :0.226709
Mean :0.306997 Mean :0.178743 Mean :0.233972
3rd Qu.:0.404087 3rd Qu.:0.229931 3rd Qu.:0.300477
Max. :0.823514 Max. :0.999521 Max. :0.965497
Win(square(log(TotalBsmtSF)))
Min. :0.003976
1st Qu.:0.076883
Median :0.226443
Mean :0.233493
3rd Qu.:0.299516
Max. :0.966195
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
TotalBsmtSF log(TotalBsmtSF) square(log(TotalBsmtSF))
Min. :0.002517 Min. :0.006178 Min. :0.004722
1st Qu.:0.143176 1st Qu.:0.128517 1st Qu.:0.135745
Median :0.331959 Median :0.323649 Median :0.326473
Mean :0.315899 Mean :0.310116 Mean :0.313815
3rd Qu.:0.407319 3rd Qu.:0.414363 3rd Qu.:0.418140
Max. :0.903203 Max. :0.994652 Max. :0.990654
Win(square(log(TotalBsmtSF)))
Min. :0.005277
1st Qu.:0.136215
Median :0.327097
Mean :0.314560
3rd Qu.:0.420007
Max. :0.988335
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features') +
xlab(label = 'Subset with no 0s')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
Looks to be some clustering here, two or three groups. Maybe those with and without a second basement, maybe basement types. I think regression will suss out what information is needed, but it’s worth noting.
The raw feature actually correlates much better with the target variable when 0s are included. It might be worth keeping the raw features for the Lasso regression to weed through. Normalizing apart from 0s may or may not help other regressions; it would be ideal for some kind of combo regression that splits/clusters then finds a linear regression, which is sort of what I’m attempting by feeding the binary version of these features to Lasso (where not already encoded by a factor). Normalizing apart for 0s is not likely to help decision trees much, but may help KNN a little by pulling 0s farther away and outliers closer.
x = 'Win(square(log(TotalBsmtSF)))'
min_val = min(Win_log_x_squared)
max_val = max(Win_log_x_squared)
print(paste("min_val:", min_val))
[1] "min_val: 32.6597927030784"
print(paste("max_val:", max_val))
[1] "max_val: 60.3486636755867"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(square(log(TotalBsmtSF)))' = ifelse(
TotalBsmtSF == 0,
0,
Winsorize(
log(TotalBsmtSF)^2,
# probs = c(0.005, 0.995),
minval = min_val,
maxval = max_val
)
)
) %>%
select(-c('Win(log(TotalBsmtSF))', 'log(TotalBsmtSF)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
I can create a binary for basement when I one-hot encode during modeling, but I want to be able to use it for exploration here.
val_train_Xy = val_train_Xy %>%
mutate('Bsmt.bin' = factor(ifelse(TotalBsmtSF == 0, 0, 1), ordered = T))
x = 'Bsmt.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 20)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "1" "0"
val_train_Xy = select(val_train_Xy, -c('Bsmt.bin'))
TotalBsmtSF - BsmtUnfSF
x = 'TotalBsmtFinSF'
val_train_Xy = val_train_Xy %>%
mutate('TotalBsmtFinSF' = TotalBsmtSF - BsmtUnfSF)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
TotalBsmtFinSF
Min. : 0.0
1st Qu.: 0.0
Median : 456.0
Mean : 493.6
3rd Qu.: 786.0
Max. :5644.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1
)
NULL
x = 'sqrt(TotalBsmtFinSF)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(TotalBsmtFinSF)' = sqrt(TotalBsmtFinSF))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
sqrt(TotalBsmtFinSF)
Min. : 0.00
1st Qu.: 0.00
Median :21.35
Mean :17.39
3rd Qu.:28.04
Max. :75.13
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
Just the non-zero set.
x = 'sqrt(TotalBsmtFinSF)'
df = val_train_Xy[val_train_Xy$TotalBsmtFinSF != 0, ]
qqnorm(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')
qqline(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = df[[x]],
probs = c(0.03, 0.995),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = df$TotalBsmtFinSF,
probs = c(0.001, 0.99),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')
qqline(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')
print(shapiro.test(x = df$TotalBsmtFinSF))
Shapiro-Wilk normality test
data: df$TotalBsmtFinSF
W = 0.83537, p-value < 2.2e-16
print(shapiro.test(x = df$`sqrt(TotalBsmtFinSF)`))
Shapiro-Wilk normality test
data: df$`sqrt(TotalBsmtFinSF)`
W = 0.97064, p-value = 3.291e-08
print(shapiro.test(x = Win_sqrt_x))
Shapiro-Wilk normality test
data: Win_sqrt_x
W = 0.99214, p-value = 0.01256
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.97166, p-value = 5.265e-08
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(TotalBsmtFinSF))' = ifelse(
TotalBsmtFinSF == 0,
0,
Winsorize(
x = sqrt(TotalBsmtFinSF),
# probs = c(0.01, 0.995),
minval = min(Win_sqrt_x),
maxval = max(Win_sqrt_x)
)
)
) %>%
mutate(
'Win(TotalBsmtFinSF)' = ifelse(
TotalBsmtFinSF == 0,
0,
Winsorize(
x = TotalBsmtFinSF,
# probs = c(0.001, 0.99)
minval = min(Win_raw_x),
maxval = max(Win_raw_x)
)
)
)
x = 'Win(sqrt(TotalBsmtFinSF))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtFinSF', 'sqrt(TotalBsmtFinSF)', x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
TotalBsmtFinSF sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
Min. :0.0004767 Min. :0.007371 Min. :0.008608
1st Qu.:0.0941765 1st Qu.:0.093248 1st Qu.:0.098192
Median :0.2387013 Median :0.166897 Median :0.155155
Mean :0.2619773 Mean :0.222351 Mean :0.218137
3rd Qu.:0.3211208 3rd Qu.:0.277837 3rd Qu.:0.277832
Max. :0.9567699 Max. :0.957414 Max. :0.955968
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
TotalBsmtFinSF sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
Min. :0.002897 Min. :0.0002521 Min. :0.0187
1st Qu.:0.112971 1st Qu.:0.1000694 1st Qu.:0.1153
Median :0.274222 Median :0.2348650 Median :0.2413
Mean :0.295891 Mean :0.2677646 Mean :0.2713
3rd Qu.:0.388785 3rd Qu.:0.3526043 3rd Qu.:0.3546
Max. :0.917361 Max. :0.9644098 Max. :0.9887
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(sqrt(TotalBsmtFinSF))'
min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
print(paste("min_val:", min_val))
[1] "min_val: 10.1234508028247"
print(paste("max_val:", max_val))
[1] "max_val: 46.3884140769296"
# Already hard coded above.
val_train_Xy = select(val_train_Xy, -c('Win(TotalBsmtFinSF)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
Mostly gas forced air (GasA, 697). Can probably drop this.
summary(val_train_Xy$Heating)
None Floor GasA GasW Grav OthW Wall
0 0 702 7 2 1 3
val_train_Xy = select(val_train_Xy, -c('Heating'))
Mostly excellent (360), then interestingly more average (211) than good (117). Enough variance to keep, but not normal.
x = 'HeatingQC'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Ex" "TA" "Gd"
662 yes, 53 no. Probably drop this one.
summary(val_train_Xy$CentralAir)
N Y
39 676
val_train_Xy = select(val_train_Xy, -c('CentralAir'))
655 standard circuit breaker and Romex. Probably drop, but there are none mixed, so it might be worth making it an ordered factor if keeping.
summary(val_train_Xy$Electrical)
None SBrkr FuseA FuseF FuseP Mix
0 650 49 14 1 1
val_train_Xy = select(val_train_Xy, -c('Electrical'))
Dropping as component of GrLivArea.
summary(val_train_Xy$X1stFlrSF)
Min. 1st Qu. Median Mean 3rd Qu. Max.
334 881 1086 1167 1392 4692
val_train_Xy = select(val_train_Xy, -c('X1stFlrSF'))
Dropping as component of GrLivArea. The presence of a second floor is encoded in HouseStyle, but it’s also convoluted between a few levels. I’ll make a binary out of it and keep that.
val_train_Xy = val_train_Xy %>%
mutate('X2ndFlr.bin' = ifelse(X2ndFlrSF <= 0, 0, 1)) %>%
mutate('X2ndFlr.bin.fact' = factor(X2ndFlr.bin, ordered = T))
x = 'X2ndFlr.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
x = 'X2ndFlr.bin.fact'
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "1" "0"
val_train_Xy = select(val_train_Xy, -c('X2ndFlr.bin.fact'))
702 0s. Drop this feature, though it would be a useful modifier of GrLivArea as a detracting component of it.
summary(val_train_Xy$LowQualFinSF)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 4.846 0.000 528.000
val_train_Xy = select(val_train_Xy, -c('LowQualFinSF'))
As this is polymodal to the number of floors, I’ll start by faceting before transforming.
ggplot(val_train_Xy, aes(x = GrLivArea)) +
geom_histogram(binwidth = 50) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
x = 'GrLivArea'
summary(val_train_Xy[x])
GrLivArea
Min. : 334
1st Qu.:1129
Median :1456
Mean :1515
3rd Qu.:1791
Max. :5642
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = .1
)
NULL
x = 'log2(GrLivArea)'
val_train_Xy = val_train_Xy %>%
mutate('log2(GrLivArea)' = log2(GrLivArea))
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .1) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
log2(GrLivArea)
Min. : 8.384
1st Qu.:10.141
Median :10.508
Mean :10.486
3rd Qu.:10.807
Max. :12.462
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = 1
)
NULL
x = 'square(log2(GrLivArea))'
val_train_Xy = val_train_Xy %>%
mutate('square(log2(GrLivArea))' = log2(GrLivArea)^2)
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = 1) +
facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
square(log2(GrLivArea))
Min. : 70.29
1st Qu.:102.84
Median :110.41
Mean :110.18
3rd Qu.:116.78
Max. :155.30
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
geom_histogram(binwidth = 1) +
facet_wrap(facets = vars(.data$X2ndFlr.bin), ncol = 1)
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
geom_histogram(binwidth = 1) +
facet_grid(
cols = vars(.data$X2ndFlr.bin),
rows = vars(.data$YearBuilt.fact)
)
It looks like there might be more polymodality, particularly among mid-century builds, but I’ll leave it here. The transformations seem to have better normalized both the full variable and its subsets by storey and age. Winsorization should help both too.
x = 'square(log2(GrLivArea))'
qqnorm(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')
qqline(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')
qqnorm(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')
qqline(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_log2_x_squared = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.002, 0.998),
na.rm = T
)
qqnorm(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')
qqline(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')
Win_log2_x = Winsorize(
x = val_train_Xy$`log2(GrLivArea)`,
probs = c(0.002, 0.998),
na.rm = T
)
qqnorm(y = Win_log2_x, ylab = 'Win_log2_x')
qqline(y = Win_log2_x, ylab = 'Win_log2_x')
Win_raw_x = Winsorize(
x = val_train_Xy$GrLivArea,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$GrLivArea))
Shapiro-Wilk normality test
data: val_train_Xy$GrLivArea
W = 0.92124, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log2(GrLivArea)`))
Shapiro-Wilk normality test
data: val_train_Xy$`log2(GrLivArea)`
W = 0.99301, p-value = 0.002004
print(shapiro.test(x = val_train_Xy[['square(log2(GrLivArea))']]))
Shapiro-Wilk normality test
data: val_train_Xy[["square(log2(GrLivArea))"]]
W = 0.99284, p-value = 0.001655
print(shapiro.test(x = Win_log2_x_squared))
Shapiro-Wilk normality test
data: Win_log2_x_squared
W = 0.99603, p-value = 0.06738
print(shapiro.test(x = Win_log2_x))
Shapiro-Wilk normality test
data: Win_log2_x
W = 0.99606, p-value = 0.06957
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.97112, p-value = 1.136e-10
Winsorizing the log2 better normalized than squaring it, but Winsorizing the squared log2 is close, too. May be overfit to add the square.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log2(GrLivArea))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)),
probs = c(0.002, 0.998),
na.rm = T
)
) %>%
mutate(
'Win(square(log2(GrLivArea)))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
probs = c(0.002, 0.998),
na.rm = T
)
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GrLivArea', 'log2(GrLivArea)', 'square(log2(GrLivArea))', 'Win(log2(GrLivArea))', 'Win(square(log2(GrLivArea)))')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
GrLivArea log2(GrLivArea) square(log2(GrLivArea))
Min. :0.004074 Min. :0.007031 Min. :0.005632
1st Qu.:0.170723 1st Qu.:0.138405 1st Qu.:0.144044
Median :0.300589 Median :0.308047 Median :0.308343
Mean :0.320766 Mean :0.319385 Mean :0.320768
3rd Qu.:0.444557 3rd Qu.:0.439274 3rd Qu.:0.442560
Max. :0.829900 Max. :0.826904 Max. :0.831350
Win(log2(GrLivArea)) Win(square(log2(GrLivArea)))
Min. :0.007115 Min. :0.005648
1st Qu.:0.127613 1st Qu.:0.128787
Median :0.306231 Median :0.304675
Mean :0.317085 Mean :0.318136
3rd Qu.:0.428208 3rd Qu.:0.429922
Max. :0.827504 Max. :0.831948
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
The Winsorized double transformation may be slightly overfitting and overcomputing, but it’s slightly (likely insignificantly) more normal and correlated to SalePrice than the Winsorized single transformation. I’ll go with it despite common sense, just for fun.
x = 'Win(square(log2(GrLivArea)))'
min_val = min(Win_log2_x_squared)
max_val = max(Win_log2_x_squared)
print(paste("min_val:", min_val))
[1] "min_val: 78.8827556705776"
print(paste("max_val:", max_val))
[1] "max_val: 143.236548514549"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(square(log2(GrLivArea)))' = Winsorize(
ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('log2(GrLivArea)', 'Win(log2(GrLivArea))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
I could make two features, total baths above grade and total basement baths, but I’ll keep it simple.
val_train_Xy = val_train_Xy %>%
mutate(TotBaths = FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath)
x = 'TotBaths'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
ggplot(val_train_Xy, aes(x = factor(TotBaths), y = log(SalePrice))
) +
geom_jitter(alpha = 0.5, aes(color = factor(BsmtFullBath + BsmtHalfBath))) +
geom_boxplot(
notch = T,
notchwidth = .1,
varwidth = T,
alpha = 0,
color = 'blue'
) +
geom_violin(alpha = 0) +
geom_line(
stat = 'summary',
fun = quantile,
fun.args = list(probs = .9),
linetype = 2, aes(group = 1)
) +
geom_line(stat = 'summary', fun = mean, mapping = aes(group = 1)) +
geom_line(
stat = 'summary',
fun = quantile,
fun.args = list(probs = .1),
linetype = 2, aes(group = 1)
) +
xlab(label = 'TotBaths') + ylab(label = 'log(SalePrice)')
Basement bathrooms don’t seem to add much value when there aren’t many bathrooms altogether.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "3.5" "2" "3" "1" "1.5" "2.5"
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
TotBaths
Min. :1.000
1st Qu.:2.000
Median :2.000
Mean :2.217
3rd Qu.:2.500
Max. :6.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.5,
t_binw = 0.1
)
NULL
x = 'sqrt(TotBaths)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(TotBaths)' = sqrt(TotBaths))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
sqrt(TotBaths)
Min. :1.000
1st Qu.:1.414
Median :1.414
Mean :1.464
3rd Qu.:1.581
Max. :2.449
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
NULL
x = 'sqrt(TotBaths)'
qqnorm(y = val_train_Xy$TotBaths, ylab = 'TotBaths')
qqline(y = val_train_Xy$TotBaths, ylab = 'TotBaths')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.0, 0.999),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = val_train_Xy$TotBaths,
probs = c(0, 0.998),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$TotBaths))
Shapiro-Wilk normality test
data: val_train_Xy$TotBaths
W = 0.92874, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(TotBaths)`))
Shapiro-Wilk normality test
data: val_train_Xy$`sqrt(TotBaths)`
W = 0.9241, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
Shapiro-Wilk normality test
data: Win_sqrt_x
W = 0.92391, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.93136, p-value < 2.2e-16
Maybe just lightly top-coding the raw variable is best.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(TotBaths))' = Winsorize(
sqrt(TotBaths),
probs = c(0, 0.999),
na.rm = T
)
) %>%
mutate(
'Win(TotBaths)' = Winsorize(
TotBaths,
probs = c(0, 0.998),
na.rm = T
)
)
x = 'Win(TotBaths)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotBaths', 'sqrt(TotBaths)', 'Win(sqrt(TotBaths))', 'Win(TotBaths)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
TotBaths sqrt(TotBaths) Win(sqrt(TotBaths))
Min. :0.004047 Min. :0.001146 Min. :0.0004721
1st Qu.:0.157148 1st Qu.:0.155984 1st Qu.:0.1568243
Median :0.329253 Median :0.331356 Median :0.3319365
Mean :0.329428 Mean :0.328623 Mean :0.3294207
3rd Qu.:0.487119 3rd Qu.:0.483214 3rd Qu.:0.4844607
Max. :0.687363 Max. :0.687107 Max. :0.6868680
Win(TotBaths)
Min. :0.002175
1st Qu.:0.160011
Median :0.331592
Mean :0.332448
3rd Qu.:0.493532
Max. :0.688339
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(TotBaths)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
[1] "min_val: 1"
print(paste("max_val:", max_val))
[1] "max_val: 4.786"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotBaths)' = Winsorize(
FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath,
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('sqrt(TotBaths)', 'Win(sqrt(TotBaths))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 0.5)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
x = 'BedroomAbvGr'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
df = select(val_train_Xy, c('log(SalePrice)', 'BedroomAbvGr'))
df$BedroomAbvGr.fact = factor(df$BedroomAbvGr)
sum_and_trans_fact(data = df, x = 'BedroomAbvGr.fact', y = y)
NULL
p_vals = get_signif_levels(
data = df,
x = 'BedroomAbvGr.fact',
z = y,
min_n = 30
)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "3" "2" "4"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
BedroomAbvGr
Min. :0.004635
1st Qu.:0.041301
Median :0.090023
Mean :0.151626
3rd Qu.:0.209621
Max. :0.654507
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.5,
t_binw = 0.1
)
NULL
x = 'sqrt(BedroomAbvGr)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(BedroomAbvGr)' = sqrt(BedroomAbvGr))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
sqrt(BedroomAbvGr)
Min. :1.000
1st Qu.:1.414
Median :1.732
Mean :1.682
3rd Qu.:1.732
Max. :2.828
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
NULL
x = 'sqrt(BedroomAbvGr)'
qqnorm(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')
qqline(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0, 0.999),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = val_train_Xy$BedroomAbvGr,
probs = c(0, 0.995),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$BedroomAbvGr))
Shapiro-Wilk normality test
data: val_train_Xy$BedroomAbvGr
W = 0.8457, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(BedroomAbvGr)`))
Shapiro-Wilk normality test
data: val_train_Xy$`sqrt(BedroomAbvGr)`
W = 0.84731, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
Shapiro-Wilk normality test
data: Win_sqrt_x
W = 0.8486, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.8574, p-value < 2.2e-16
Maybe just lightly Winsorizing the raw variable is best.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(BedroomAbvGr))' = Winsorize(
sqrt(BedroomAbvGr),
probs = c(0, 0.999),
na.rm = T
)
) %>%
mutate(
'Win(BedroomAbvGr)' = Winsorize(
BedroomAbvGr,
probs = c(0, 0.995),
na.rm = T
)
)
x = 'Win(BedroomAbvGr)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr', 'sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))', 'Win(BedroomAbvGr)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
BedroomAbvGr sqrt(BedroomAbvGr) Win(sqrt(BedroomAbvGr))
Min. :0.004635 Min. :0.001827 Min. :0.002047
1st Qu.:0.041301 1st Qu.:0.044497 1st Qu.:0.044839
Median :0.090023 Median :0.094369 Median :0.094250
Mean :0.151626 Mean :0.150984 Mean :0.151281
3rd Qu.:0.209621 3rd Qu.:0.216792 3rd Qu.:0.217943
Max. :0.654507 Max. :0.638464 Max. :0.635445
Win(BedroomAbvGr)
Min. :0.003767
1st Qu.:0.041751
Median :0.091144
Mean :0.154296
3rd Qu.:0.219703
Max. :0.649031
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(BedroomAbvGr)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
[1] "min_val: 1"
print(paste("max_val:", max_val))
[1] "max_val: 5.42999999999995"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(BedroomAbvGr)' = Winsorize(
BedroomAbvGr,
# probs = c(0.002, 0.998),
# na.rm = T
minval = min_val,
maxval = max_val
)
) %>%
select(-c('sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
summary(val_train_Xy$KitchenAbvGr)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.000 1.000 1.041 1.000 3.000
val_train_Xy = select(val_train_Xy, -c('KitchenAbvGr'))
x = 'KitchenQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Gd" "TA" "Ex"
x = 'TotRmsAbvGrd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
df = select(val_train_Xy, c('log(SalePrice)', 'TotRmsAbvGrd'))
df$TotRmsAbvGrd.fact = factor(df$TotRmsAbvGrd)
sum_and_trans_fact(data = df, x = 'TotRmsAbvGrd.fact', y = y)
NULL
p_vals = get_signif_levels(
data = df,
x = 'TotRmsAbvGrd.fact',
z = y,
min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "8" "5" "6" "4" "7" "9"
summary(val_train_Xy[x])
TotRmsAbvGrd
Min. : 2.000
1st Qu.: 5.000
Median : 6.000
Mean : 6.543
3rd Qu.: 7.000
Max. :14.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
TotRmsAbvGrd
Min. :0.00436
1st Qu.:0.10320
Median :0.22718
Mean :0.28644
3rd Qu.:0.40813
Max. :0.83195
x = 'TotRmsAbvGrd'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_raw_x = Winsorize(
x = val_train_Xy$TotRmsAbvGrd,
probs = c(0, 0.975),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')
qqline(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')
print(shapiro.test(x = val_train_Xy$TotRmsAbvGrd))
Shapiro-Wilk normality test
data: val_train_Xy$TotRmsAbvGrd
W = 0.93616, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.94556, p-value = 1.508e-15
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotRmsAbvGrd)' = Winsorize(
TotRmsAbvGrd,
probs = c(0, 0.975),
na.rm = T
)
)
x = 'Win(TotRmsAbvGrd)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd', 'Win(TotRmsAbvGrd)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
TotRmsAbvGrd Win(TotRmsAbvGrd)
Min. :0.00436 Min. :0.006598
1st Qu.:0.10320 1st Qu.:0.092855
Median :0.22718 Median :0.224392
Mean :0.28644 Mean :0.286050
3rd Qu.:0.40813 3rd Qu.:0.414866
Max. :0.83195 Max. :0.833412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(TotRmsAbvGrd)'
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
[1] "min_val: 2"
print(paste("max_val:", max_val))
[1] "max_val: 10.15"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(TotRmsAbvGrd)' = Winsorize(
TotRmsAbvGrd,
# probs = c(0, 0.975),
# na.rm = T
minval = min_val,
maxval = max_val
)
)
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
x = 'Functional'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
x = 'Fireplaces'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$Fireplaces.fact = factor(val_train_Xy$Fireplaces, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.fact', y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.fact', z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "2" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
Fireplaces
Min. :0.0001063
1st Qu.:0.1125161
Median :0.2436102
Mean :0.2242305
3rd Qu.:0.3210328
Max. :0.4867665
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))
NULL
The variable needs more than three unique values in order to use my function to check for transformations, and 0 can’t be one of them in order to include log transformations. So, I’ll use Fireplaces + 1 to search for transformations.
val_train_Xy$Fireplaces_plus1 = val_train_Xy$Fireplaces + 1
x = 'Fireplaces_plus1'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
x = 'Fireplaces'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_raw_x = Winsorize(
x = val_train_Xy$Fireplaces,
probs = c(0, 0.999),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$Fireplaces))
Shapiro-Wilk normality test
data: val_train_Xy$Fireplaces
W = 0.76773, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.76773, p-value < 2.2e-16
Winsorization doesn’t help.
There’s no apparent meaningful difference in price between houses with 1 fireplace and those with 2 or 3. It’s significant (p < 0.1), but not apparently meaningful given the overlapping notches in the boxplot above, though I did not take Cohen’s d for effect size.
Binarization may condense the feature space without a lot of information loss. In fact, it improves the correlation to the target variable.
val_train_Xy = val_train_Xy %>%
mutate(Fireplaces.bin = ifelse(Fireplaces == 0, 0, 1))
x = 'Fireplaces.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$Fireplaces.bin.fact = factor(val_train_Xy$Fireplaces.bin, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.bin.fact', y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.bin.fact', z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces', 'Fireplaces.bin')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
Fireplaces Fireplaces.bin
Min. :0.0001063 Min. :0.002832
1st Qu.:0.1163174 1st Qu.:0.138881
Median :0.2436660 Median :0.220999
Mean :0.2380836 Mean :0.237974
3rd Qu.:0.3215369 3rd Qu.:0.299826
Max. :1.0000000 Max. :0.886894
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
val_train_Xy = select(
val_train_Xy,
-c('Fireplaces.fact', 'Fireplaces_plus1', 'Fireplaces.bin.fact',
'Fireplaces')
)
x = 'FireplaceQu'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "None" "TA" "Gd"
One-hot encoding this FireplaceQu will include a binarized version (“None”:int[0,1]), so we can drop Fireplace.bin altogether, and Fireplace because it just adds noise. I’ll take care of that during modeling with caret.
x = 'GarageType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Attchd" "Detchd" "None" "BuiltIn"
Similar distribution as year built for house. May not add additional information, but may be useful in interaction with type and finish. Could drop and use YearBuilt as proxy but would lose some info.
x = 'GarageYrBlt'
summary(val_train_Xy[x])
GarageYrBlt
Min. :1908
1st Qu.:1961
Median :1979
Mean :1978
3rd Qu.:2002
Max. :2010
NA's :40
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
gg = ggplot(val_train_Xy, aes(x = GarageYrBlt))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
x = 'GarageYrBlt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
GarageYrBlt
Min. :0.006671
1st Qu.:0.057040
Median :0.177159
Mean :0.235013
3rd Qu.:0.313315
Max. :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
Even “controlling” for the year the house was built, GarageYrBlt doesn’t predict sale prices well, as seen in the next few plots. I’ll just drop it.
val_train_Xy = val_train_Xy %>%
mutate(
'GarBltLater' = factor(ifelse((GarageYrBlt - YearBuilt) <= 0, 0, 1))
)
ggplot(val_train_Xy, aes(x = GarageYrBlt, y = YearBuilt)) +
geom_jitter(
alpha = 0.5,
aes(
color = SalePrice.fact
),
position = position_jitter(w = 1, h = 1)
)
fbv = fenced_jbv(
data = val_train_Xy,
x = 'GarBltLater',
y = 'log(SalePrice)'
)
fbv
fbv +
facet_wrap(facets = vars(YearBuilt.fact))
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
df = filter(
val_train_Xy,
GarageYrBlt - YearBuilt != 0
)
ggplot(df, aes(x = GarageYrBlt, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = 'log(SalePrice) ~ YearBuilt Residuals')
print(
paste(
"Correlation of GarageYRBlt to Age residual:",
cor(df$GarageYrBlt, df$resids)
)
)
[1] "Correlation of GarageYRBlt to Age residual: 0.0701692753019878"
val_train_Xy = select(val_train_Xy, -c('GarageYrBlt', 'GarBltLater'))
x = 'GarageFinish'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "RFn" "Unf" "None" "Fin"
x = 'GarageCars'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
val_train_Xy$GarageCars.fact = factor(
val_train_Xy$GarageCars,
ordered = T
)
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.fact', y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'GarageCars.fact', z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "2" "1" "3" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
GarageCars
Min. :0.005684
1st Qu.:0.162049
Median :0.279956
Mean :0.293729
3rd Qu.:0.408933
Max. :0.870258
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))
NULL
val_train_Xy$GarageCars.plus1 = val_train_Xy$GarageCars + 1
x = 'GarageCars.plus1'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
x = 'GarageCars'
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
val_train_Xy$`Win(GarageCars)` = Winsorize(
x = val_train_Xy[[x]],
probs = c(0, 0.999),
na.rm = T
)
qqnorm(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')
qqline(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy[[x]]))
Shapiro-Wilk normality test
data: val_train_Xy[[x]]
W = 0.83694, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`Win(GarageCars)`))
Shapiro-Wilk normality test
data: val_train_Xy$`Win(GarageCars)`
W = 0.83446, p-value < 2.2e-16
val_train_Xy = val_train_Xy %>%
mutate(GarageCars.bin = ifelse(GarageCars == 0, 0, 1))
x = 'GarageCars.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$GarageCars.bin.fact =
factor(val_train_Xy$GarageCars.bin, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.bin.fact', y = y)
NULL
p_vals = get_signif_levels(
data = val_train_Xy, x = 'GarageCars.bin.fact', z = y, min_n = 30
)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "1" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageCars', 'Win(GarageCars)', 'GarageCars.bin')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
GarageCars Win(GarageCars) GarageCars.bin
Min. :0.005684 Min. :0.005378 Min. :0.005967
1st Qu.:0.163144 1st Qu.:0.163484 1st Qu.:0.051718
Median :0.282048 Median :0.283037 Median :0.090024
Mean :0.306341 Mean :0.307958 Mean :0.118027
3rd Qu.:0.419101 3rd Qu.:0.421339 3rd Qu.:0.148698
Max. :1.000000 Max. :0.999361 Max. :0.568588
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
That didn’t seem to help.
Polymodal in interaction with GarageCars.
x = 'GarageArea'
summary(val_train_Xy[x])
GarageArea
Min. : 0.0
1st Qu.: 312.0
Median : 480.0
Mean : 469.3
3rd Qu.: 576.0
Max. :1418.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1
)
NULL
x = 'sqrt(GarageArea)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(GarageArea)' = sqrt(GarageArea))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
sqrt(GarageArea)
Min. : 0.00
1st Qu.:17.66
Median :21.91
Mean :20.68
3rd Qu.:24.00
Max. :37.66
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
NULL
x = 'sqrt(GarageArea)'
df = filter(val_train_Xy, GarageArea != 0)
qqnorm(y = df$GarageArea, ylab = 'GarageArea')
qqline(y = df$GarageArea, ylab = 'GarageArea')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = df[[x]],
probs = c(0, 0.998),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = df$GarageArea,
probs = c(0, 0.987),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$GarageArea))
Shapiro-Wilk normality test
data: val_train_Xy$GarageArea
W = 0.97888, p-value = 1.218e-08
print(shapiro.test(x = val_train_Xy$`sqrt(GarageArea)`))
Shapiro-Wilk normality test
data: val_train_Xy$`sqrt(GarageArea)`
W = 0.85021, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
Shapiro-Wilk normality test
data: Win_sqrt_x
W = 0.98761, p-value = 1.749e-05
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.97029, p-value = 1.824e-10
min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(GarageArea))' = ifelse(
GarageArea == 0,
0,
Winsorize(
sqrt(GarageArea),
minval = min_val,
maxval = max_val
)
)
)
x = 'Win(sqrt(GarageArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageArea', 'sqrt(GarageArea)', x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
GarageArea sqrt(GarageArea) Win(sqrt(GarageArea))
Min. :0.006474 Min. :0.001747 Min. :0.001705
1st Qu.:0.139205 1st Qu.:0.121113 1st Qu.:0.121213
Median :0.324645 Median :0.268493 Median :0.264631
Mean :0.317963 Mean :0.287737 Mean :0.286712
3rd Qu.:0.430672 3rd Qu.:0.375451 3rd Qu.:0.372716
Max. :0.873194 Max. :0.875112 Max. :0.876250
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
GarageArea sqrt(GarageArea) Win(sqrt(GarageArea))
Min. :0.002476 Min. :0.002494 Min. :0.002863
1st Qu.:0.128551 1st Qu.:0.144903 1st Qu.:0.145087
Median :0.337145 Median :0.322871 Median :0.319261
Mean :0.315598 Mean :0.315749 Mean :0.314570
3rd Qu.:0.456501 3rd Qu.:0.470921 3rd Qu.:0.470155
Max. :0.819002 Max. :0.837390 Max. :0.840302
NA's :1 NA's :1 NA's :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
Transforming and Winsorizing doesn’t help the overall correlation to SalePrice. Let’s facet it out and see.
df = filter(
val_train_Xy,
(GarageType %in% c('Attchd', 'Detchd', 'BuiltIn')) &
(GarageCars != 4) # Ony one point in this subset with 4 and it doesn't really change the correlations, but it saves viz space.
)
ggplot(df, aes(x = `Win(sqrt(GarageArea))`, y = `log(SalePrice)`)) +
geom_jitter(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_grid(rows = vars(GarageType), cols = vars(GarageCars.fact))
sum_df0 = df %>%
summarize(
n = n(),
GarageArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`GarageArea`
),
sqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`sqrt(GarageArea)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
)
min_n = 30
# sum_df1 = df %>%
# group_by(GarageCars.fact, GarageType) %>%
# summarize(
# n = n(),
# GarageArea_cor = cor(
# x = .data$`Win(log(SalePrice))`,
# y = .data$`GarageArea`
# ),
# sqrtArea_cor = cor(
# x = .data$`Win(log(SalePrice))`,
# y = .data$`sqrt(GarageArea)`
# ),
# WinsqrtArea_cor = cor(
# x = .data$`Win(log(SalePrice))`,
# y = .data$`Win(sqrt(GarageArea))`
# )
# ) %>%
# filter(n >= min_n)
sum_df2 = df %>% group_by(GarageType) %>%
summarize(
n = n(),
GarageArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`GarageArea`
),
sqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`sqrt(GarageArea)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
) %>%
filter(n >= min_n)
sum_df3 = df %>% group_by(GarageCars.fact) %>%
summarize(
n = n(),
GarageArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`GarageArea`
),
sqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`sqrt(GarageArea)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
) %>%
filter(n >= min_n)
sum_df0 %>%
merge(y = sum_df2, all = T) %>%
merge(y = sum_df3, all = T) %>%
# merge(y = sum_df1, all = T) %>%
select(
c('GarageType', 'GarageCars.fact', 'n', 'GarageArea_cor',
'sqrtArea_cor', 'WinsqrtArea_cor')
) %>%
arrange(GarageType, GarageCars.fact)
Transforming and Winsorizing GarageArea improves correlation to the target variable (“Win(log(SalePrice))”) somewhat when excluding houses without garages. Grouping by garage type helps in some cases.
Whether to transform the variable or not may depend on which ML algorithm we’re using and how. A decision tree will likely be indifferent to tranformations, though it may benefit from noise reduction with Winsorization. A linear regression will only benefit if type and/or missingness is factored in, as would KNN.
Grouping by number of cars lowers the area:price correlation to no correlation. And, in fact, number of cars has a stronger correlation to the target variable than area does. Let’s see how grouping by type further improves that correlation.
df = filter(
val_train_Xy,
(GarageType %in% c('Attchd', 'Detchd', 'BuiltIn'))
)
ggplot(df, aes(x = GarageCars, y = `log(SalePrice)`)) +
geom_jitter(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(facets = vars(GarageType))
sum_df0 = df %>%
summarize(
n = n(),
Cars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$GarageCars
),
WinCars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(GarageCars)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
)
sum_df2 = df %>% group_by(GarageType) %>%
summarize(
n = n(),
Cars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$GarageCars
),
WinCars_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(GarageCars)`
),
WinsqrtArea_cor = cor(
x = .data$`Win(log(SalePrice))`,
y = .data$`Win(sqrt(GarageArea))`
)
)
sum_df0 %>%
merge(y = sum_df2, all = T) %>%
select(
c('GarageType', 'n', 'Cars_cor', 'WinCars_cor', 'WinsqrtArea_cor')
) %>%
arrange(GarageType)
It looks worth dropping GarageArea in favor of GarageCars. There is an argument against linear regression using discrete variables, but it seems to work nonetheless.
Winsorizing GarageCars only produces a marginal benefit which may prove spurious as it only adjusts one point. Plus, it reduces normality. So, I’ll just use the raw feature.
There are likely other features at play, like year built, that also affect things like the difference in the prices of two-car attached garages and two-car detached garages.
fenced_jbv(
data = df[df$GarageCars != 4, ],
x = 'GarageType',
y = 'log(SalePrice)',
jit_col = 'YearBuilt.fact',
leg_lbl = 'Year Built'
) +
facet_wrap(facets = vars(GarageCars.fact))
val_train_Xy = select(
val_train_Xy,
-c('GarageCars.fact', 'GarageCars.plus1', 'GarageCars.bin',
'GarageCars.bin.fact', 'GarageArea', 'sqrt(GarageArea)',
'Win(sqrt(GarageArea))')
)
summary(val_train_Xy[ , c('GarageQual', 'GarageCond')])
GarageQual GarageCond
None: 40 None: 40
Po : 1 Po : 4
Fa : 29 Fa : 20
TA :633 TA :646
Gd : 11 Gd : 4
Ex : 1 Ex : 1
val_train_Xy = select(val_train_Xy, -c('GarageQual', 'GarageCond'))
summary(val_train_Xy$PavedDrive)
None N P Y
0 53 19 643
val_train_Xy = select(val_train_Xy, -c('PavedDrive'))
x = 'WoodDeckSF'
summary(val_train_Xy[x])
WoodDeckSF
Min. : 0.00
1st Qu.: 0.00
Median : 0.00
Mean : 96.78
3rd Qu.:175.50
Max. :736.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$WoodDeckSF != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = .1
)
NULL
x = 'cbrt(WoodDeckSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(WoodDeckSF)' = (WoodDeckSF)^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(WoodDeckSF)
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :2.712
3rd Qu.:5.599
Max. :9.029
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
NULL
df = filter(val_train_Xy, WoodDeckSF != 0)
qqnorm(y = df$WoodDeckSF, ylab = 'WoodDeckSF')
qqline(y = df$WoodDeckSF, ylab = 'WoodDeckSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
x = df[[x]],
probs = c(0.01, 0.99),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)
Win_raw_x = Winsorize(
x = df$WoodDeckSF,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'WoodDeckSF')
qqline(y = Win_raw_x, ylab = 'WoodDeckSF')
print(shapiro.test(x = df$WoodDeckSF))
Shapiro-Wilk normality test
data: df$WoodDeckSF
W = 0.88365, p-value = 1.951e-15
print(shapiro.test(x = df[[x]]))
Shapiro-Wilk normality test
data: df[[x]]
W = 0.98712, p-value = 0.003914
print(shapiro.test(x = Win_cbrt_x))
Shapiro-Wilk normality test
data: Win_cbrt_x
W = 0.98648, p-value = 0.002765
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.93225, p-value = 2.343e-11
val_train_Xy = val_train_Xy %>%
mutate(
'Win(cbrt(WoodDeckSF))' = ifelse(
WoodDeckSF == 0,
0,
Winsorize(
WoodDeckSF^(1/3),
minval = min(Win_cbrt_x),
maxval = max(Win_cbrt_x)
)
)
) %>%
mutate(
'Win(WoodDeckSF)' = ifelse(
WoodDeckSF == 0,
0,
Winsorize(
WoodDeckSF,
minval = min(Win_raw_x),
maxval = max(Win_raw_x)
)
)
)
val_train_Xy = val_train_Xy %>%
mutate('WoodDeck.bin' = ifelse(WoodDeckSF == 0, 0, 1)) %>%
mutate('WoodDeck.bin.fact' = factor(WoodDeck.bin, ordered = T))
x = 'WoodDeck.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('WoodDeckSF', 'cbrt(WoodDeckSF)', 'Win(cbrt(WoodDeckSF))',
'Win(WoodDeckSF)', 'WoodDeck.bin')
x = 'Win(cbrt(WoodDeckSF))'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
WoodDeckSF cbrt(WoodDeckSF) Win(cbrt(WoodDeckSF))
Min. :0.006145 Min. :0.004212 Min. :0.004404
1st Qu.:0.084695 1st Qu.:0.069548 1st Qu.:0.069704
Median :0.176322 Median :0.161833 Median :0.161822
Mean :0.160453 Mean :0.165207 Mean :0.165111
3rd Qu.:0.234533 3rd Qu.:0.252250 3rd Qu.:0.251680
Max. :0.328898 Max. :0.355420 Max. :0.355488
Win(WoodDeckSF) WoodDeck.bin
Min. :0.001863 Min. :0.002522
1st Qu.:0.082133 1st Qu.:0.055760
Median :0.179367 Median :0.161167
Mean :0.163279 Mean :0.149999
3rd Qu.:0.242096 3rd Qu.:0.228064
Max. :0.339011 Max. :0.330345
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
WoodDeckSF cbrt(WoodDeckSF) Win(cbrt(WoodDeckSF))
Min. :0.001777 Min. :0.002974 Min. :0.004668
1st Qu.:0.060621 1st Qu.:0.056354 1st Qu.:0.056724
Median :0.112829 Median :0.144985 Median :0.142206
Mean :0.114102 Mean :0.123596 Mean :0.124123
3rd Qu.:0.157044 3rd Qu.:0.172657 3rd Qu.:0.169070
Max. :0.285351 Max. :0.298849 Max. :0.302455
Win(WoodDeckSF) WoodDeck.bin
Min. :0.001231 Min. : NA
1st Qu.:0.060845 1st Qu.: NA
Median :0.122578 Median : NA
Mean :0.116930 Mean :NaN
3rd Qu.:0.155611 3rd Qu.: NA
Max. :0.304426 Max. : NA
NA's :55
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
# plot_scat_pairs(
# df = filter(val_train_Xy, WoodDeckSF != 0),
# x = feat,
# y_lst = y_lst
# )
}
There’s no real correlation between deck square footage and the target price when you remove houses without decks; the binary version does most of the work. But, the transformation does it a little better and does offer more normalized distance between points for KNN.
x = 'Win(cbrt(WoodDeckSF))'
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
print(paste("min_val:", min_val))
[1] "min_val: 2.99287415882937"
print(paste("max_val:", max_val))
[1] "max_val: 8.48252791006478"
# Already hard coded above
val_train_Xy = select(
val_train_Xy,
-c('WoodDeck.bin', 'WoodDeck.bin.fact', 'Win(WoodDeckSF)')
)
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .25)
x = 'OpenPorchSF'
summary(val_train_Xy[x])
OpenPorchSF
Min. : 0.00
1st Qu.: 0.00
Median : 27.00
Mean : 47.46
3rd Qu.: 72.00
Max. :547.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$OpenPorchSF != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.1
)
NULL
x = 'cbrt(OpenPorchSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(OpenPorchSF)' = OpenPorchSF^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(OpenPorchSF)
Min. :0.000
1st Qu.:0.000
Median :3.000
Mean :2.276
3rd Qu.:4.160
Max. :8.178
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.1,
t_binw = 0.1
)
NULL
df = filter(val_train_Xy, OpenPorchSF != 0)
x = 'cbrt(OpenPorchSF)'
qqnorm(y = df$OpenPorchSF, ylab = 'OpenPorchSF')
qqline(y = df$OpenPorchSF, ylab = 'OpenPorchSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
x = df[[x]],
probs = c(0.001, 0.994),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)
Win_raw_x = Winsorize(
x = df$OpenPorchSF,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'OpenPorchSF')
qqline(y = Win_raw_x, ylab = 'OpenPorchSF')
print(shapiro.test(x = df$OpenPorchSF))
Shapiro-Wilk normality test
data: df$OpenPorchSF
W = 0.79866, p-value < 2.2e-16
print(shapiro.test(x = df[[x]]))
Shapiro-Wilk normality test
data: df[[x]]
W = 0.97116, p-value = 5.932e-07
print(shapiro.test(x = Win_cbrt_x))
Shapiro-Wilk normality test
data: Win_cbrt_x
W = 0.97391, p-value = 1.921e-06
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.89182, p-value = 6.251e-16
val_train_Xy = val_train_Xy %>%
mutate(
'Win(cbrt(OpenPorchSF))' = ifelse(
OpenPorchSF == 0,
0,
Winsorize(
OpenPorchSF^(1/3),
# probs = c(0.001, 0.994),
# na.rm = T
minval = min(Win_cbrt_x),
maxval = max(Win_cbrt_x)
)
)
)
Looks like some polymodality happening. Year built doesn’t seem to explain it. I’m not going to manually search for it any further, but a decision tree or other ML algorithm may find and “factor” in the hidden interaction.
ggplot(
filter(val_train_Xy, OpenPorchSF != 0),
aes(x = `cbrt(OpenPorchSF)`)
) +
geom_histogram(binwidth = 0.1) +
facet_wrap(facets = vars(YearBuilt.fact), ncol = 1)
ggplot(
filter(val_train_Xy, OpenPorchSF != 0),
aes(x = `cbrt(OpenPorchSF)`, y = `log(SalePrice)`)
) +
geom_point(aes(color = YearBuilt))
val_train_Xy = val_train_Xy %>%
mutate('OpenPorch.bin' = ifelse(OpenPorchSF == 0, 0, 1)) %>%
mutate('OpenPorch.bin.fact' = factor(OpenPorch.bin, ordered = T))
x = 'OpenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "1" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('OpenPorchSF', 'cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))',
'OpenPorch.bin')
x = 'Win(cbrt(OpenPorchSF))'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
OpenPorchSF cbrt(OpenPorchSF) Win(cbrt(OpenPorchSF))
Min. :0.004127 Min. :0.003166 Min. :0.002332
1st Qu.:0.059251 1st Qu.:0.090986 1st Qu.:0.091328
Median :0.142723 Median :0.167835 Median :0.168485
Mean :0.151759 Mean :0.202395 Mean :0.202442
3rd Qu.:0.232678 3rd Qu.:0.325809 3rd Qu.:0.326224
Max. :0.339325 Max. :0.460438 Max. :0.459971
OpenPorch.bin
Min. :0.000127
1st Qu.:0.082138
Median :0.152291
Mean :0.203538
3rd Qu.:0.356361
Max. :0.465546
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
OpenPorchSF cbrt(OpenPorchSF) Win(cbrt(OpenPorchSF))
Min. :0.002912 Min. :0.0005588 Min. :0.001252
1st Qu.:0.037026 1st Qu.:0.0371173 1st Qu.:0.033447
Median :0.070241 Median :0.0705708 Median :0.067662
Mean :0.074408 Mean :0.0750998 Mean :0.073149
3rd Qu.:0.106605 3rd Qu.:0.1177230 3rd Qu.:0.112676
Max. :0.192231 Max. :0.2019503 Max. :0.191297
OpenPorch.bin
Min. : NA
1st Qu.: NA
Median : NA
Mean :NaN
3rd Qu.: NA
Max. : NA
NA's :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
In this case, the binary variable is doing all the work.
val_train_Xy = select(
val_train_Xy,
-c('cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))', 'OpenPorch.bin.fact')
)
x = 'EnclosedPorch'
summary(val_train_Xy[x])
EnclosedPorch
Min. : 0.00
1st Qu.: 0.00
Median : 0.00
Mean : 19.09
3rd Qu.: 0.00
Max. :294.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$EnclosedPorch != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5,
t_binw = 5
)
NULL
df = filter(val_train_Xy, EnclosedPorch != 0)
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_raw_x = Winsorize(
x = df$EnclosedPorch,
probs = c(0.001, 0.999),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = df[[x]]))
Shapiro-Wilk normality test
data: df[[x]]
W = 0.98306, p-value = 0.2587
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.9827, p-value = 0.2437
val_train_Xy = val_train_Xy %>%
mutate('EnclosedPorch.bin' = ifelse(EnclosedPorch == 0, 0, 1)) %>%
mutate(
'EnclosedPorch.bin.fact' = factor(
EnclosedPorch.bin,
ordered = T
)
)
x = 'EnclosedPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('EnclosedPorch', 'EnclosedPorch.bin')
x = 'EnclosedPorch'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
EnclosedPorch EnclosedPorch.bin
Min. :0.004006 Min. :0.001413
1st Qu.:0.025521 1st Qu.:0.043374
Median :0.069574 Median :0.095328
Mean :0.087443 Mean :0.113655
3rd Qu.:0.138839 3rd Qu.:0.165249
Max. :0.371542 Max. :0.458928
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = c(x),
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
EnclosedPorch
Min. :0.005257
1st Qu.:0.089806
Median :0.169637
Mean :0.184809
3rd Qu.:0.248067
Max. :0.474650
NA's :3
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
Houses with enclosed porches are significantly cheaper than those without, maybe due to a confounding variable like year built. There is a weak, if existent, positive correlation between square footage and price within those that have them.
fenced_jbv(
data = val_train_Xy,
x = 'EnclosedPorch.bin.fact',
y = 'log(SalePrice)',
jit_col = 'YearBuilt',
jit_alpha = 1
) +
facet_wrap(facets = vars(YearBuilt.fact))
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
ggplot(val_train_Xy, aes(x = EnclosedPorch, y = resids)) +
geom_point() +
geom_smooth(method = 'lm')
val_train_Xy = select(
val_train_Xy, -c('EnclosedPorch.bin', 'EnclosedPorch.bin.fact')
)
summary(val_train_Xy$X3SsnPorch)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 4.505 0.000 508.000
val_train_Xy = select(val_train_Xy, -c('X3SsnPorch'))
x = 'ScreenPorch'
summary(val_train_Xy[x])
ScreenPorch
Min. : 0.00
1st Qu.: 0.00
Median : 0.00
Mean : 16.48
3rd Qu.: 0.00
Max. :480.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$ScreenPorch != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.1
)
NULL
x = 'cbrt(ScreenPorch)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(ScreenPorch)' = ScreenPorch^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
cbrt(ScreenPorch)
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.4832
3rd Qu.:0.0000
Max. :7.8297
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$`cbrt(ScreenPorch)` != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.1,
t_binw = 0.1
)
NULL
x = 'cbrt(ScreenPorch)'
df = filter(val_train_Xy, ScreenPorch != 0)
qqnorm(y = df$ScreenPorch, ylab = 'ScreenPorch')
qqline(y = df$ScreenPorch)
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]])
Win_cbrt_x = Winsorize(
df[[x]],
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win_cbrt_x')
qqline(y = Win_cbrt_x)
Win_raw_x = Winsorize(
df$ScreenPorch,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)
print(shapiro.test(x = df$ScreenPorch))
Shapiro-Wilk normality test
data: df$ScreenPorch
W = 0.92914, p-value = 0.001652
print(shapiro.test(x = df[[x]]))
Shapiro-Wilk normality test
data: df[[x]]
W = 0.97514, p-value = 0.2487
print(shapiro.test(x = Win_cbrt_x))
Shapiro-Wilk normality test
data: Win_cbrt_x
W = 0.96719, p-value = 0.1008
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.92848, p-value = 0.001547
val_train_Xy = val_train_Xy %>%
mutate(
'Win(cbrt(ScreenPorch))' = ifelse(
ScreenPorch == 0,
0,
Winsorize(
ScreenPorch^(1/3),
minval = min(Win_cbrt_x),
maxval = max(Win_cbrt_x)
)
)
)
val_train_Xy = val_train_Xy %>%
mutate('ScreenPorch.bin' = ifelse(ScreenPorch ==0, 0, 1)) %>%
mutate('ScreenPorch.bin.fact' = factor(ScreenPorch.bin, ordered = T))
x = 'ScreenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('ScreenPorch', 'cbrt(ScreenPorch)', 'Win(cbrt(ScreenPorch))',
'ScreenPorch.bin')
x = 'Win(cbrt(ScreenPorch))'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
ScreenPorch cbrt(ScreenPorch) Win(cbrt(ScreenPorch))
Min. :0.000127 Min. :0.001702 Min. :0.0007076
1st Qu.:0.033205 1st Qu.:0.032575 1st Qu.:0.0310287
Median :0.052721 Median :0.054455 Median :0.0552326
Mean :0.062430 Mean :0.060759 Mean :0.0601016
3rd Qu.:0.074751 3rd Qu.:0.083482 3rd Qu.:0.0840852
Max. :0.225009 Max. :0.226688 Max. :0.2256480
ScreenPorch.bin
Min. :0.001074
1st Qu.:0.026853
Median :0.056082
Mean :0.057789
3rd Qu.:0.086152
Max. :0.220821
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
ScreenPorch cbrt(ScreenPorch) Win(cbrt(ScreenPorch))
Min. :0.01673 Min. :0.003906 Min. :0.0138
1st Qu.:0.09574 1st Qu.:0.106601 1st Qu.:0.1020
Median :0.14405 Median :0.143076 Median :0.1361
Mean :0.14621 Mean :0.159218 Mean :0.1533
3rd Qu.:0.20818 3rd Qu.:0.233239 3rd Qu.:0.2118
Max. :0.37341 Max. :0.325564 Max. :0.2796
NA's :1 NA's :1 NA's :1
ScreenPorch.bin
Min. : NA
1st Qu.: NA
Median : NA
Mean :NaN
3rd Qu.: NA
Max. : NA
NA's :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
The binary doesn’t seem to make a significant difference as other binaries like WoodDeck and OpenPorch. So, it’s not worth using alone, but might be useful in interaction with the area in a linear regression. Decision trees should be able to do without the binary feature. The transformation may help KNN, but like a decision tree, I would want to avoid overweighting by including both.
fenced_jbv(
data = val_train_Xy,
x = 'ScreenPorch.bin.fact',
y = 'log(SalePrice)',
jit_col = 'YearBuilt',
jit_alpha = 1
) +
facet_wrap(facets = vars(YearBuilt.fact))
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `sqrt(Age)`,
val_train_Xy
)$residuals
ggplot(val_train_Xy, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm')
print("Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`")
[1] "Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`"
print(cor(x = val_train_Xy$`cbrt(ScreenPorch)`, y = val_train_Xy$resids))
[1] 0.2266884
df = filter(val_train_Xy, ScreenPorch != 0)
ggplot(df, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm')
print("Exluding ScreenPorch 0s:")
[1] "Exluding ScreenPorch 0s:"
print(cor(x = val_train_Xy$`cbrt(ScreenPorch)`, y = val_train_Xy$resids))
[1] 0.2266884
This feature doesn’t seem to have much to offer. But, I’ll leave it anyway and let feature selection during modeling suss that out.
val_train_Xy = select(
val_train_Xy, -c('ScreenPorch.bin.fact', 'Win(cbrt(ScreenPorch))')
)
summary(val_train_Xy$PoolQC)
None Po Fa TA Gd Ex
712 0 1 0 1 1
val_train_Xy = select(val_train_Xy, -c('PoolArea', 'PoolQC'))
x = 'Fence'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "None" "MnPrv"
val_train_Xy = val_train_Xy %>%
mutate('Fence.bin' = ifelse(Fence == 'None', 0, 1)) %>%
mutate('Fence.bin.fact' = factor(Fence.bin, ordered = T))
x = 'Fence.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "0" "1"
Having a Fence seems to detract value, probably due to an interaction with another variable as we saw with EnclosedPorch and age. Unlike EnclosedPorch, it’s not immediately obvious what the other variable is, and “controlling” for it with a linear regression may actually increase Fence’s significance. So, rather than hunting for the other variable(s) or dropping this one, I’ll leave it and see if ML modeling can make use of it.
val_train_Xy = select(
val_train_Xy,
-c('Fence.bin', 'Fence.bin.fact')
)
MiscVal is kind of a cheater variable. It should have precisely 1 for its coefficient. Otherwise, I would just drop it for so few observations; it might just throw of the regression. If I keep it, it should be transformed in the same way that the target variable is.
It also looks like the presence of a miscellaneous improvement is associated with a lower price if anything.
x = 'MiscFeature'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
x = 'MiscVal'
summary(val_train_Xy[x])
MiscVal
Min. : 0.00
1st Qu.: 0.00
Median : 0.00
Mean : 54.19
3rd Qu.: 0.00
Max. :15500.00
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MiscVal != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 200,
t_binw = .1
)
NULL
x = 'log2(MiscVal)'
val_train_Xy = val_train_Xy %>%
mutate(
'log2(MiscVal)' = ifelse(
MiscVal == 0,
0,
log2(MiscVal)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
log2(MiscVal)
Min. : 0.0000
1st Qu.: 0.0000
Median : 0.0000
Mean : 0.2708
3rd Qu.: 0.0000
Max. :13.9200
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$`log2(MiscVal)` != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = .1,
t_binw = .1
)
NULL
Alright, I’ll give it a natural log like SalePrice, since it is a straight dollar value.
x = 'log(MiscVal)'
val_train_Xy = val_train_Xy %>%
mutate(
'log(MiscVal)' = ifelse(
MiscVal == 0,
0,
log(MiscVal)
)
)
df = filter(val_train_Xy, MiscVal != 0)
gg = ggplot(df, aes(x = `log(MiscVal)`))
p1 = gg + geom_histogram(binwidth = 0.1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
Since this variable is an actual dollar value, Winsorizing it doesn’t really make sense. I’ll check it out anyway.
qqnorm(y = df$MiscVal, ylab = 'MiscVal')
qqline(y = df$MiscVal)
qqnorm(y = df$`log(MiscVal)`, ylab = 'log(MiscVal)')
qqline(y = df$`log(MiscVal)`)
Win_log_x = Winsorize(
df$`log(MiscVal)`,
probs = c(0.007, 0.95),
na.rm = T
)
qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x)
Win_raw_x = Winsorize(
df$MiscVal,
probs = c(0, 0.95)
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)
print(shapiro.test(x = df$MiscVal))
Shapiro-Wilk normality test
data: df$MiscVal
W = 0.49238, p-value = 2.714e-07
print(shapiro.test(x = df$`log(MiscVal)`))
Shapiro-Wilk normality test
data: df$`log(MiscVal)`
W = 0.88923, p-value = 0.02603
print(shapiro.test(x = Win_log_x))
Shapiro-Wilk normality test
data: Win_log_x
W = 0.89412, p-value = 0.03203
print(shapiro.test(x = Win_raw_x))
Shapiro-Wilk normality test
data: Win_raw_x
W = 0.5586, p-value = 1.129e-06
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log(MiscVal))' = ifelse(
MiscVal == 0,
0,
Winsorize(
log(MiscVal),
minval = min(Win_log_x),
maxval = max(Win_log_x)
)
)
)
val_train_Xy = val_train_Xy %>%
mutate('MiscVal.bin' = ifelse(MiscVal == 0, 0, 1)) %>%
mutate('MiscVal.bin.fact' = factor(MiscVal.bin, ordered = T))
x = 'MiscVal.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MiscVal', 'log2(MiscVal)', 'log(MiscVal)',
'Win(log(MiscVal))', 'MiscVal.bin')
x = 'log(MiscVal)'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
[1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
MiscVal log2(MiscVal) log(MiscVal)
Min. :0.001150 Min. :0.001803 Min. :0.001803
1st Qu.:0.006437 1st Qu.:0.013649 1st Qu.:0.013649
Median :0.013835 Median :0.028586 Median :0.028586
Mean :0.018486 Mean :0.032804 Mean :0.032804
3rd Qu.:0.025099 3rd Qu.:0.047086 3rd Qu.:0.047086
Max. :0.083097 Max. :0.087143 Max. :0.087143
Win(log(MiscVal)) MiscVal.bin
Min. :0.002117 Min. :0.00218
1st Qu.:0.013673 1st Qu.:0.01782
Median :0.028980 Median :0.03383
Mean :0.032855 Mean :0.03774
3rd Qu.:0.046462 3rd Qu.:0.05123
Max. :0.086933 Max. :0.09574
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
[1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
MiscVal log2(MiscVal) log(MiscVal)
Min. :0.006626 Min. :0.009576 Min. :0.009576
1st Qu.:0.050062 1st Qu.:0.121361 1st Qu.:0.121361
Median :0.123861 Median :0.233112 Median :0.233112
Mean :0.139666 Mean :0.254203 Mean :0.254203
3rd Qu.:0.190636 3rd Qu.:0.369457 3rd Qu.:0.369457
Max. :0.458312 Max. :0.590246 Max. :0.590246
Win(log(MiscVal)) MiscVal.bin
Min. :0.001031 Min. : NA
1st Qu.:0.114223 1st Qu.: NA
Median :0.264792 Median : NA
Mean :0.270993 Mean :NaN
3rd Qu.:0.415090 3rd Qu.: NA
Max. :0.610388 Max. : NA
NA's :58
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
plot_scat_pairs(
df = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = feat,
y_lst = y_lst
)
}
MiscVal appears to be correlated with the size of the lot and house. Perhaps that will do the heavy lifting and make MiscVal obsolete.
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `Win(LotArea)`,
val_train_Xy
)$residuals
print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`")
[1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
[1] -0.08294709
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")
df = filter(val_train_Xy, MiscVal != 0)
print("Exluding MiscVal 0s:")
[1] "Exluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
[1] 0.4032275
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")
val_train_Xy$resids = lm(
`log(SalePrice)` ~ `log10(log10(LotArea))`,
val_train_Xy
)$residuals
print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`")
[1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
[1] -0.0788646
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")
df = filter(val_train_Xy, MiscVal != 0)
print("Exluding MiscVal 0s:")
[1] "Exluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
[1] 0.4945909
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
geom_point() +
geom_smooth(method = 'lm') +
ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")
After factoring in Lot Area, there’s still some correlation between MiscVal and the target once 0s are removed. It may still be worth keeping. We’ll let modeling suss that out.
val_train_Xy = select(
val_train_Xy, -c('MiscVal.bin', 'MiscVal.bin.fact', 'log2(MiscVal)')
)
MoSold somewhat normally distributed around June/July. Conventional wisdom says that summer sales are higher in volume and price, but the data don’t bear that out for price.
Records go from January 2006 through July 2010, so August-December are underrepresented by roughly 20%.
YrSold pretty uniform (about 140-160) except for 2010 which ended in July in this set.
Interesting spike in sales in Spring 2010. Foreclosures coming onto market?
val_train_Xy = val_train_Xy %>%
mutate('MoSold.fact' = factor(MoSold)) %>%
mutate('YrSold.fact' = factor(YrSold, ordered = T))
x = 'MoSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "1" "7" "5" "9" "12"
x = 'YrSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
character(0)
val_train_Xy = val_train_Xy %>%
mutate(
SoldDate = as.Date(
paste(
as.character(YrSold),
as.character(MoSold),
'15',
sep = '/'
),
format = '%Y/%m/%d'
)
)
ggplot(val_train_Xy, aes(x = SoldDate)) +
geom_bar()
x = 'SoldDate'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('SoldDate')
#
# df = get_cors(
# data = filter(
# select(val_train_Xy, all_of(num_feats)),
# !is.na(.data[[x]])
# ),
# x_lst = x_lst,
# feats = num_feats
# )
# df
# print("Summary of absolute values of Pearson's Rs:")
# df = abs(df)
# summary(abs(df))
#
# df = melt(df)
# ggplot(df, aes(x = variable, y = value)) +
# geom_boxplot(notch = T) +
# ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
NULL
ggplot(
val_train_Xy,
aes(x = SoldDate, y = `log(SalePrice)`)
) +
geom_point() +
geom_smooth() +
geom_smooth(method = 'lm', color = 'Yellow') +
facet_grid(rows = vars(SaleCondition), cols = vars(SaleType))
None of date seems to matter in this set. Drop it, but no yet.
x = 'SaleType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "WD" "New"
df = val_train_Xy[val_train_Xy$SaleType %in% c('WD', 'New', 'COD'), ]
gg = ggplot(df, aes(x = SaleType))
gg + geom_bar() +
facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
gg + geom_bar(position = 'dodge', aes(fill = factor(YrSold)))
ggplot(df, aes(color = SaleType, x = SoldDate)) +
geom_freqpoly()
# ggplot(df, aes(x = SaleType, y = SalePrice)) +
# geom_col(position = 'dodge', aes(fill = factor(YrSold), stat = 'mean'))
ggplot(df, aes(x = SaleType, y = `log(SalePrice)`)) +
geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))
You can see new home sales drop off with the crash. Court officer deeds also ticked up, possibly due to more foreclosures. But, there didn’t seem to be a significant difference in sale prices year over year within sale type groups, except between new sales in 2007and 2010 which is not fully represented in this set.
592 normal, 61 partial (home not completed when last assessed), 43 abnormal (trade, foreclosure, short sale), 11 family. I’m guessing a family sale will be lower in price typically, as will foreclosures and shortsales. I’m not sure what to make of partials; the house wasn’t fully built when assessed, so the price may be askew?
Surprisingly, abnormal sales didn’t seem to vary with the crash. Ames appears to have fared well.
x = 'SaleCondition'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
[1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
[1] "Normal" "Abnorml" "Partial"
df = filter(val_train_Xy, SaleCondition %in% c('Normal', 'Abnorml', 'Partial'))
gg = ggplot(df, aes(x = SaleCondition))
gg + geom_bar() +
facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
gg + geom_bar(aes(fill = factor(YrSold)), position = 'dodge')
ggplot(df, aes(x = SoldDate, color = SaleCondition)) +
geom_freqpoly()
ggplot(df, aes(x = SaleCondition, y = `log(SalePrice)`)) +
geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))
Bearing in mind that 2010 is not a complete year in this set, partial sales dropped as normal sales increased. This trend may be explained by developers finishing/halting their projects. Filtering/grouping by year built and/or neighborhood might help check this, but I’ll skip it for the sake of finishing.
Fall and winter months seemed to be where the bulk of these increases in normal sales fell each year.
To wrap up this notebook, and as a preliminary gauge on how well I have prepared the data for ML, mainly for linear regression, I’ll check out how correlations have changed. Have my variables increased in correlation in general? Have they increased in correlation to the target variable?
Increased correlations to the target variable have obvious benefits. Increased correlations between predictor variables will help clarify which variables may overlap in their predictive power and redundantly overweight the same underlying information.
I made a lot of new features, dropping some along the way. In cases where a Winsorized version seemed a good option, I also kept the scale-transformed version where applicable. For example, I kept both log(SalePrice) and Win(log(SalePrice)). Winsorization will help a straightforward linear regression without interactions. But, Winsorization will undercut KNN’s ability to cluster multivariate outliers and similarly RF’s ability to group by extremes. That said, Winsorization will reduce the chances of overfit in all three. All that is to say that there are redundant new features.
I can’t think of a quick and dirty way to view this without getting skewed results or busy heatmaps, other than to make a table of all the variables’ correlations to the target variable which isn’t very easily digested either. The slow and tedious way would be to iteratively “manually” (to some extent) select like variables for correlation within their experimental group. I’m not goin to do that.
val_train_Xy_numeric = select(
val_train_Xy, # Reorder for easier comparison.
c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))', "LotFrontage",
"log10(LotFrontage)", "Win(log10(LotFrontage))", "Win(LotFrontage)",
"LotArea", "log10(log10(LotArea))", "Win(LotArea)", "OverallQual_int",
"OverallCond_int", "YearBuilt", "sqrt(Age)", "YearRemodAdd", "MasVnrArea",
"cbrt(MasVnrArea)", "Win(cbrt(MasVnrArea))", "BsmtFinSF1", "BsmtFinSF2",
"BsmtUnfSF", "cbrt(BsmtUnfSF)", "square(log(TotalBsmtSF))",
"Win(square(log(TotalBsmtSF)))", "TotalBsmtSF", "TotalBsmtFinSF",
"sqrt(TotalBsmtFinSF)", "Win(sqrt(TotalBsmtFinSF))", "X2ndFlrSF",
"X2ndFlr.bin", "GrLivArea", "square(log2(GrLivArea))",
"Win(square(log2(GrLivArea)))", "TotBaths", "Win(TotBaths)",
"BedroomAbvGr", "Win(BedroomAbvGr)", "TotRmsAbvGrd", "Win(BedroomAbvGr)",
"TotRmsAbvGrd", "Win(TotRmsAbvGrd)", "Fireplaces.bin", "GarageCars",
"Win(GarageCars)", "WoodDeckSF", "cbrt(WoodDeckSF)",
"Win(cbrt(WoodDeckSF))", "OpenPorchSF", "OpenPorch.bin", "EnclosedPorch",
"ScreenPorch", "cbrt(ScreenPorch)", "ScreenPorch.bin", "MiscVal",
"log(MiscVal)", "Win(log(MiscVal))", "MoSold", "YrSold")
)
ggcorr(val_train_Xy_numeric)
cor_mtx = cor(val_train_Xy_numeric, use = 'pairwise.complete.obs')
target_vars_vec = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))')
cor_mtx_melted = melt(cor_mtx)
sales_cor_mtx_melted = filter(
cor_mtx_melted,
Var1 %in% target_vars_vec & !(Var2 %in% target_vars_vec)
)
ggplot(sales_cor_mtx_melted, aes(x = Var1, y = Var2)) +
geom_tile(aes(fill = value))
dcast(sales_cor_mtx_melted, formula = Var2 ~ Var1)
fenced_jbv(
data = sales_cor_mtx_melted,
x = 'Var1',
y = 'value',
jit_h = 0
)
I’ll write it to a CSV file so I can verify that my final engineering script duplicates this process. I’ll verify in the next notebook before I start modeling.
val_train_Xy$Id = val_train_X$Id
saveRDS(val_train_Xy, 'data/eda_val_train_Xy.rds')
head(readRDS('data/eda_val_train_Xy.rds'))